library(Seurat)
library(harmony)
library(magrittr)
library(gridExtra)
library(ggthemes)
# library(monocle)
library(reticulate)
library(ggbeeswarm)
library(ggrepel)
library(plyr)
library(ggplot2)
library(gridExtra)
library(ggthemes)
library(dplyr)
library(Seurat)
library(RColorBrewer)
library(ggbeeswarm)
library(ggrepel)
library(MASS)
library(matrixStats)
library(viridis)
library(grid)
meta_colors <- list(
"tissue" = c(
"blood" = "#66C2A5",
"marrow" = "#FC8D62",
"TF" = "#80B1D3",
"Cord blood" = "#66C2A5",
"Bone marrow" = "#FC8D62",
"IBD" = "#A6761D", # IBD
"healthy (IBD)" = "#E6AB02", # healthy vs IBD
"healthy (SLE)" = "#B2DF8A",
"healthy (PF)" = "#A6CEE3", # healthy vs Pulmonary Fibrosis
"PF" = "#1F78B4",
"Crohn" = "#6A3D9A",
"SLE" = "#33A02C",
"RA" = "#E31A1C",
"OA" = "#FB9A99"
),
"source" = c(
"tissue" = "#BD0026",
"PBMC" = "#7570B3",
"fluid" = "#FEB24C"
),
"cluster" = c(
"0" = "#E41A1C",
"1" = "#377EB8",
"2" = "#4DAF4A",
"3" = "#984EA3",
"4" = "#FF7F00",
"5" = "#FFFF33",
"6" = "#A65628",
"7" = "#F781BF",
"8" = "#999999",
"9" = "#66C2A5",
"10" = "#FC8D62",
"11" = "#8DA0CB",
"12" = "#E78AC3",
"13" = "#A6D854",
"14" = "#FFD92F",
"15" = "#E5C494",
"16" = "#B3B3B3",
"17" = "#E41A1C",
"18" = "#377EB8",
"19" = "#4DAF4A",
"20" = "#984EA3",
"21" = "#FF7F00",
"22" = "#FFFF33"
)
)
suppressMessages({source("/data/srlab/ik936/Foxxy/utils/libraries.R")
library(viridis)
library(data.table)
library(dplyr)
library(reticulate)
library(Rcpp)
library(scales)
library(irlba)
library(Seurat,lib.loc = "~/R/x86_64-pc-linux-gnu-library/3.4/")
source("/data/srlab/ik936/Foxxy/utils/utils.R")
source("/data/srlab/anathan/scripts/scseq_utils.R")
library(patchwork)
library(presto)
# library(uwot)
})
library(umap)
Sys.getenv("R_LIBS_USER")
# Functions
# FindVariableGenesBatch <- function(exprs_mat, meta_df, cell_col = "cell_id",
# batch_col = "sample_id", ngenes_use = 1e3, expr_min = .1) {
# x_res <- split(meta_df[[cell_col]], meta_df[[batch_col]]) %>% lapply(function(x) {
# FindVariableGenesSeurat(exprs_mat[, x]) %>%
# subset(gene.mean >= expr_min) %>%
# tibble::rownames_to_column("gene") %>%
# dplyr::arrange(-gene.dispersion) %>%
# head(ngenes_use)
# })
# data.table(Reduce(rbind, x_res))[, .N, by = gene][order(-N)]
# }
FindVariableGenesBatch <- function(exprs_mat, meta_df, genes_exclude = NULL, ngenes_use = 1e3, expr_min = .1) {
if (!is.null(genes_exclude)) {
genes_use <- setdiff(row.names(exprs_mat), genes_exclude)
}
x_res <- split(meta_df$cell, meta_df$sample) %>% lapply(function(x) {
FindVariableGenesSeurat(exprs_mat[genes_use, x]) %>%
subset(gene.mean >= expr_min) %>%
tibble::rownames_to_column("gene") %>%
dplyr::arrange(-gene.dispersion) %>%
head(ngenes_use)
})
data.table(Reduce(rbind, x_res))[, .N, by = gene][order(-N)]
}
FindVariableGenesSeurat <- function (data, x.low.cutoff = 0.1, x.high.cutoff = 8,
y.cutoff = 1, y.high.cutoff = Inf, num.bin = 0,
binning.method = "equal_width", sort.results = TRUE,
display.progress = TRUE, ...)
{
genes.use <- rownames(data)
if (class(data) != "dgCMatrix") {
data <- as(as.matrix(data), "dgCMatrix")
}
## (1) get means and variances
gene.mean <- FastExpMean(data, display.progress)
names(gene.mean) <- genes.use
gene.dispersion <- FastLogVMR(data, display.progress)
names(gene.dispersion) <- genes.use
gene.dispersion[is.na(x = gene.dispersion)] <- 0
gene.mean[is.na(x = gene.mean)] <- 0
mv.df <- data.frame(gene.mean, gene.dispersion)
rownames(mv.df) <- rownames(data)
## (OPTIONAL) do the binning correction
if (num.bin > 0) {
if (binning.method == "equal_width") {
data_x_bin <- cut(x = gene.mean, breaks = num.bin)
}
else if (binning.method == "equal_frequency") {
data_x_bin <- cut(x = gene.mean, breaks = c(-1, quantile(gene.mean[gene.mean >
0], probs = seq(0, 1, length.out = num.bin))))
}
else {
stop(paste0("Invalid selection: '", binning.method,
"' for 'binning.method'."))
}
names(x = data_x_bin) <- names(x = gene.mean)
mean_y <- tapply(X = gene.dispersion, INDEX = data_x_bin,
FUN = mean)
sd_y <- tapply(X = gene.dispersion, INDEX = data_x_bin,
FUN = sd)
gene.dispersion.scaled <- (gene.dispersion - mean_y[as.numeric(x = data_x_bin)])/sd_y[as.numeric(x = data_x_bin)]
gene.dispersion.scaled[is.na(x = gene.dispersion.scaled)] <- 0
##names(gene.dispersion.scaled) <- names(gene.mean)
mv.df$gene.dispersion.scaled <- gene.dispersion.scaled
}
return(mv.df)
}
environment(FindVariableGenesSeurat) <- asNamespace("Seurat")
ScaleDataSeurat <- function (data.use, margin = 1, scale.max = 10,
block.size = 1000) {
if (margin == 2) data.use %<>% t
max.block <- ceiling(nrow(data.use)/block.size)
## Define data and functions to use in sparse and dense cases
if (class(data.use) == "dgCMatrix" | class(data.use) == "dgTMatrix") {
scale_fxn <- function(x) {
FastSparseRowScale(mat = x, scale = TRUE, center = TRUE,
scale_max = scale.max, display_progress = FALSE)
}
} else {
scale_fxn <- function(x) {
FastRowScale(mat = x, scale = TRUE, center = TRUE,
scale_max = scale.max, display_progress = FALSE)
}
data.use <- as.matrix(data.use)
}
## Do scaling, at once or in chunks
if (max.block == 1) {
scaled.data <- scale_fxn(data.use)
} else {
scaled.data <- matrix(NA, nrow(data.use), ncol(data.use))
for (i in 1:max.block) {
idx.min <- (block.size * (i - 1))
idx.max <- min(nrow(data.use), (block.size * i - 1) + 1)
my.inds <- idx.min:idx.max
scaled.data[my.inds, ] <- scale_fxn(data.use[my.inds, , drop = F])
}
}
colnames(scaled.data) <- colnames(data.use)
row.names(scaled.data) <- row.names(data.use)
scaled.data[is.na(scaled.data)] <- 0
if (margin == 2) scaled.data %<>% t
return(scaled.data)
}
environment(ScaleDataSeurat) <- asNamespace("Seurat")
fig.size <- function(height, width) {
options(repr.plot.height = height, repr.plot.width = width)
}
SingleFeaturePlotSeurat <- function (data.use, feature, data.plot, pt.size, pch.use, cols.use,
dim.codes, min.cutoff, max.cutoff, coord.fixed, no.axes,
no.title = FALSE, no.legend, dark.theme, vector.friendly = FALSE,
png.file = NULL, png.arguments = c(10, 10, 100))
{
if (vector.friendly) {
previous_call <- blank_call <- png_call <- match.call()
blank_call$pt.size <- -1
blank_call$vector.friendly <- FALSE
png_call$no.axes <- TRUE
png_call$no.legend <- TRUE
png_call$vector.friendly <- FALSE
png_call$no.title <- TRUE
blank_plot <- eval(blank_call, sys.frame(sys.parent()))
png_plot <- eval(png_call, sys.frame(sys.parent()))
png.file <- SetIfNull(x = png.file, default = paste0(tempfile(),
".png"))
ggsave(filename = png.file, plot = png_plot, width = png.arguments[1],
height = png.arguments[2], dpi = png.arguments[3])
to_return <- AugmentPlot(blank_plot, png.file)
file.remove(png.file)
return(to_return)
}
idx.keep <- which(!is.na(data.use[feature, ]))
data.gene <- data.frame(data.use[feature, idx.keep])
# data.gene <- na.omit(object = data.frame(data.use[feature,
# ]))
min.cutoff <- SetQuantile(cutoff = min.cutoff, data = data.gene)
max.cutoff <- SetQuantile(cutoff = max.cutoff, data = data.gene)
data.gene <- sapply(X = data.gene, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
data.gene <- sapply(X = data.gene, FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
data_plot <- data.plot[idx.keep, ]
data_plot$gene <- data.gene
if (length(x = cols.use) == 1) {
brewer.gran <- brewer.pal.info[cols.use, ]$maxcolors
}
else {
brewer.gran <- length(x = cols.use)
}
if (all(data.gene == 0)) {
data.cut <- 0
}
else {
data.cut <- as.numeric(x = as.factor(x = cut(x = as.numeric(x = data.gene),
breaks = brewer.gran)))
}
data_plot$col <- as.factor(x = data.cut)
p <- data_plot %>%
dplyr::arrange(col) %>%
ggplot(mapping = aes(x = x, y = y))
if (brewer.gran != 2) {
if (length(x = cols.use) == 1) {
p <- p + geom_point(mapping = aes(color = col), size = pt.size,
shape = pch.use) + #scale_color_brewer(palette = cols.use)
scale_color_viridis(option = "plasma", end = .9)
}
else {
p <- p + geom_point(mapping = aes(color = col), size = pt.size,
shape = pch.use) + #scale_color_manual(values = cols.use)
scale_color_viridis(option = "plasma", end = .9)
}
}
else {
if (all(data_plot$gene == data_plot$gene[1])) {
warning(paste0("All cells have the same value of ",
feature, "."))
p <- p + geom_point(color = cols.use[1], size = pt.size,
shape = pch.use)
}
else {
p <- p + geom_point(mapping = aes(color = gene),
size = pt.size, shape = pch.use) + scale_color_viridis(option = "plasma", end = .9
)
}
}
if (dark.theme) {
p <- p + DarkTheme()
}
if (no.axes) {
p <- p + theme(axis.line = element_blank(), axis.text.x = element_blank(),
axis.text.y = element_blank(), axis.ticks = element_blank(),
axis.title.x = element_blank(), axis.title.y = element_blank())
if (!no.title)
p <- p + labs(title = feature, x = "", y = "")
if (no.title)
p <- p + labs(x = "", y = "")
}
else {
if (no.title)
p <- p + labs(x = dim.codes[1], y = dim.codes[2])
if (!(no.title))
p <- p + labs(title = feature) + labs(x = "", y = "")
}
if (no.legend) {
p <- p + theme(legend.position = "none")
}
if (coord.fixed) {
p <- p + coord_fixed()
}
return(p)
}
environment(SingleFeaturePlotSeurat) <- asNamespace("Seurat")
PlotFeatures <- function(umap_use, features_plot, exprs_use, cells_use, ncols, pt_size = .5, pt_shape = ".", q_lo = "q10", q_hi = "q90") {
if (missing(cells_use)) cells_use <- 1:nrow(umap_use)
if (missing(ncols)) ncols <- round(sqrt(length(features_plot)))
plt_list <- lapply(features_plot, function(feature_use) {
SingleFeaturePlotSeurat(exprs_use[, cells_use], feature_use, data.frame(x = umap_use[cells_use, 1], y = umap_use[cells_use, 2]),
pt.size = pt_size, pch.use = pt_shape, cols.use = c("lightgrey", "blue"),
dim.codes = c("UMAP 1", "UMAP 2"), min.cutoff = c(q10 = q_lo), max.cutoff = c(q90 = q_hi),
coord.fixed = FALSE, no.axes = FALSE, dark.theme = FALSE, no.legend = TRUE)
})
plot_grid(plotlist = plt_list, ncol = ncols)
#return(plt_list)
}
BuildSNNSeurat <- function (data.use, k.param = 30, prune.SNN = 1/15, nn.eps = 0) {
my.knn <- nn2(data = data.use, k = k.param, searchtype = "standard", eps = nn.eps)
nn.ranked <- my.knn$nn.idx
snn_res <- ComputeSNN(nn_ranked = nn.ranked, prune = prune.SNN)
rownames(snn_res) <- row.names(data.use)
colnames(snn_res) <- row.names(data.use)
return(snn_res)
}
environment(BuildSNNSeurat) <- asNamespace("Seurat")
NormalizeDataSeurat <- function(A, scaling_factor = 1e4, do_ftt = FALSE) {
A@x <- A@x / rep.int(Matrix::colSums(A), diff(A@p))
A@x <- scaling_factor * A@x
if (do_ftt) {
A@x <- sqrt(A@x) + sqrt(1 + A@x)
} else {
A@x <- log(1 + A@x)
}
return(A)
}
plot_clusters <- function(cluster_ids, labels, pt_size = 14, umap_use = umap_res, collapse_labels = FALSE) {
cluster_table <- table(cluster_ids)
clusters_keep <- names(which(cluster_table > 20))
plt_df <- umap_use %>% data.frame() %>% cbind(cluster = cluster_ids) %>%
subset(cluster %in% clusters_keep)
if (!missing(labels)) {
if (collapse_labels) {
plt_df$cluster <- labels[plt_df$cluster + 1]
} else {
plt_df$cluster <- sprintf("c%d: %s", plt_df$cluster, labels[plt_df$cluster + 1])
}
}
plt_df %>%
ggplot(aes(X1, X2, col = factor(cluster))) + geom_point(shape = '.', size = 6, alpha = .7) +
geom_label_repel(data = data.table(plt_df)[, .(X1 = mean(X1), X2 = mean(X2)), by = cluster],
aes(label = cluster), size = pt_size, alpha = .8) +
theme_void() +
theme(axis.line = element_line()) +
guides(col = FALSE)
}
# ls /data/srlab1/public/srcollab/AMP_Phase_2/cellranger-3.1.0/GRCh38/ > filenames.txt
filename <- read.table("./filenames.txt", header = FALSE, stringsAsFactors = FALSE)
filename <- as.character(t(filename))
# filename
i = 1
dat <- Seurat::Read10X(paste("/data/srlab1/public/srcollab/AMP_Phase_2/cellranger-3.1.0/GRCh38/", filename[i], "/outs/filtered_feature_bc_matrix/", sep=""))
adt_exprs <- dat$`Antibody Capture` %>% as("dgCMatrix")
mRNA_exprs <- dat$`Gene Expression` %>% as("dgCMatrix")
colnames(mRNA_exprs) <- paste0(filename[i], "_", colnames(mRNA_exprs), sep="")
colnames(adt_exprs) <- paste0(filename[i], "_", colnames(adt_exprs), sep="")
meta_all <- data.frame(
cell = colnames(mRNA_exprs),
sample = rep(filename[i], ncol(mRNA_exprs))
)
myplots <- list()
for (i in 2:length(filename)) {
print(i)
dat <- Seurat::Read10X(paste("/data/srlab1/public/srcollab/AMP_Phase_2/cellranger-3.1.0/GRCh38/", filename[i], "/outs/filtered_feature_bc_matrix/", sep=""))
adt_temp <- dat$`Antibody Capture` %>% as("dgCMatrix")
mRNA_temp <- dat$`Gene Expression` %>% as("dgCMatrix")
colnames(mRNA_temp) <- paste0(filename[i], "_", colnames(mRNA_temp), sep="")
colnames(adt_temp) <- paste0(filename[i], "_", colnames(adt_temp), sep="")
meta_temp <- data.frame(
cell = colnames(mRNA_temp),
sample = rep(filename[i], ncol(mRNA_temp))
)
adt_exprs <- cbind(adt_exprs, adt_temp)
mRNA_exprs <- cbind(mRNA_exprs, mRNA_temp)
meta_all <- rbind(meta_all, meta_temp)
}
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 2
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 3
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 4
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 5
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 6
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 7
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 8
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 9
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 10
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 11
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 12
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 13
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 14
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 15
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 16
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 17
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 18
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 19
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 20
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 21
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 22
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 23
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 24
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 25
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 26
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 27
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 28
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 29
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 30
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 31
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 32
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 33
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 34
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 35
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 36
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 37
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 38
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 39
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 40
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 41
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 42
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 43
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 44
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 45
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 46
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 47
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 48
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 49
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 50
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 51
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 52
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 53
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 54
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 55
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 56
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 57
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 58
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 59
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 60
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 61
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 62
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 63
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 64
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 65
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 66
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 67
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 68
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 69
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 70
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 71
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 72
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 73
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 74
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 75
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 76
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 77
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 78
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 79
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 80
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 81
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 82
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 83
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 84
10X data contains more than one type and is being returned as a list containing matrices of each type.
[1] 85
10X data contains more than one type and is being returned as a list containing matrices of each type.
dim(adt_exprs)
dim(mRNA_exprs)
table(meta_all$sample)
all(meta_all$cell == colnames(mRNA_exprs))
all(meta_all$cell == colnames(adt_exprs))
# saveRDS(adt_exprs, "adt_exprs_2020-01-24.rds")
# saveRDS(mRNA_exprs, "mRNA_exprs_2020-01-24.rds")
# saveRDS(meta_all, "meta_all_2020-01-24.rds")
BRI-399 BRI-401 BRI-403 BRI-405 BRI-407 BRI-409 BRI-411 BRI-413
11905 8540 6597 1613 4574 4544 2948 5772
BRI-415 BRI-417 BRI-419 BRI-421 BRI-425 BRI-427 BRI-429 BRI-431
10504 3156 7458 15542 1558 15693 1407 1387
BRI-436 BRI-438 BRI-440 BRI-442 BRI-456 BRI-458 BRI-460 BRI-462
7717 4435 1687 5427 6262 14370 16023 17199
BRI-475 BRI-477 BRI-479 BRI-481 BRI-483 BRI-485 BRI-487 BRI-503
3402 3167 13921 1841 845 9022 13974 365
BRI-505 BRI-507 BRI-509 BRI-511 BRI-515 BRI-517 BRI-521 BRI-523
11423 13242 389 13242 11104 21102 2347 17866
BRI-525 BRI-527 BRI-529 BRI-534 BRI-536 BRI-538 BRI-540 BRI-542
19447 669 9606 1395 970 8118 17036 9030
BRI-544 BRI-546 BRI-548 BRI-550 BRI-552 BRI-554 BRI-556 BRI-558
649 3735 3393 8338 5485 817 12924 4580
BRI-560 BRI-562 BRI-564 BRI-566 BRI-570 BRI-581 BRI-583 BRI-585
3538 1564 819 13478 1995 394 14599 6466
BRI-587 BRI-589 BRI-601 BRI-603 BRI-605 BRI-607 BRI-609 BRI-611
13412 825 14434 583 14348 10463 1898 1292
BRI-613 BRI-615 BRI-617 BRI-619 BRI-621 BRI-623 BRI-625 BRI-627
4285 15378 3687 7356 516 9577 3334 1748
BRI-629 BRI-631 BRI-633R BRI-635 BRI-637R
1546 1252 396 6430 1225
adt_exprs <- readRDS("adt_exprs_2020-01-24.rds")
mRNA_exprs <- readRDS("mRNA_exprs_2020-01-24.rds")
meta_all <- readRDS("meta_all_2020-01-24.rds")
meta_all <- readRDS("qc_mRNA_protein_pca_umap_clus_meta_all_site_anno_hpca_2020-03-31.rds")
dim(meta_all)
mRNA_exprs <- mRNA_exprs[, meta_all$cell]
dim(mRNA_exprs)
saveRDS(mRNA_exprs, "raw_mRNA_exprs_afterQC_2020-04-21.rds")
# Filter out cells with <= 500 genes, >= 0.2 mitochondrial genes
meta_all$nUMI <- Matrix::colSums(mRNA_exprs)
meta_all$nGene <- Matrix::colSums(mRNA_exprs > 0)
mito_genes <- grep("^MT-", rownames(mRNA_exprs), value = TRUE, ignore.case = TRUE)
meta_all$percent_mito <- Matrix::colSums(mRNA_exprs[mito_genes, ])/Matrix::colSums(mRNA_exprs)
length(mito_genes)
fig.size(3,12)
(ggplot(meta_all) +
geom_histogram(aes(x = nUMI))) +
theme_bw(base_size = 20) +
theme(text = element_text(size = 20),
legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)
)+
(ggplot(meta_all) +
geom_histogram(aes(x = nGene)) +
geom_vline(xintercept = 500, col = "darkred")) +
theme_bw(base_size = 20) +
theme(text = element_text(size = 20),
legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)
) +
(ggplot(meta_all) +
geom_histogram(aes(x = percent_mito)) +
geom_vline(xintercept = 0.2, col = "darkred")) +
theme_bw(base_size = 20) +
theme(text = element_text(size = 20),
legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)
)
# ggsave(file = paste("post_QC_barplot", ".pdf", sep = ""), width = 15, height = 4, dpi = 300)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`. `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
meta_all <- readRDS("meta_all_2020-01-24.rds")
library(MASS)
get_density <- function(x, y, ...) {
dens <- MASS::kde2d(x, y, ...)
ix <- findInterval(x, dens$x)
iy <- findInterval(y, dens$y)
ii <- cbind(ix,iy)
return(dens$z[ii])
}
plot_dens <- get_density(meta_all$percent_mito, meta_all$nGene, n = 100)
fig.size(4,6)
ggplot(meta_all, aes(x = percent_mito, y = nGene, color = plot_dens)) +
geom_point(size = .3) +
scale_x_continuous(trans = "log2") +
scale_y_continuous(trans = "log2") +
geom_hline(yintercept = 500, color = "darkred") +
geom_vline(xintercept = .2, color = "darkred") +
scale_color_viridis() +
labs(color = "density") +
theme_classic(base_size = 20) +
theme(
# legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=22, face = "italic")
)
Warning message: “Transformation introduced infinite values in continuous x-axis”
# Plot per sample quality
temp <- as.data.frame(table(meta_all$sample))
sample_order <- temp[order(temp$Freq, decreasing = T),]
s10000 <- as.character(sample_order[which(sample_order$Freq > 10000),]$Var1)
s1000 <-as.character(sample_order[which(sample_order$Freq > 1000 & sample_order$Freq < 10001),]$Var1)
s100 <- as.character(sample_order[which(sample_order$Freq < 1001),]$Var1)
meta_all$sample <- as.character(meta_all$sample)
meta_all$cell <- as.character(meta_all$cell)
meta_all[1:4,]
# meta_all$sample <- factor(meta_all$sample, levels = sample_order)
| cell | sample | nUMI | nGene | percent_mito | |
|---|---|---|---|---|---|
| <chr> | <chr> | <dbl> | <int> | <dbl> | |
| 1 | BRI-399_AAACCCAAGTCATTGC | BRI-399 | 7959 | 2497 | 0.15165222 |
| 2 | BRI-399_AAACCCAAGTGAATAC | BRI-399 | 8601 | 3055 | 0.06522497 |
| 3 | BRI-399_AAACCCACAGTAGTTC | BRI-399 | 6012 | 1713 | 0.32218896 |
| 4 | BRI-399_AAACCCACAGTCCGTG | BRI-399 | 13897 | 3833 | 0.07922573 |
library(MASS)
get_density <- function(x, y, ...) {
dens <- MASS::kde2d(x, y, ...)
ix <- findInterval(x, dens$x)
iy <- findInterval(y, dens$y)
ii <- cbind(ix,iy)
return(dens$z[ii])
}
test <- meta_all[which(meta_all$sample %in% s10000),]
plot_dens <- get_density(test$percent_mito, test$nGene, n = 100)
fig.size(10,45)
ggplot(test,
aes(x = percent_mito, y = nGene, color = plot_dens)
) +
geom_point(size = .3) +
geom_hline(yintercept = 500, color = "darkred") +
geom_vline(xintercept = .2, color = "darkred") +
facet_wrap(sample ~ . , ncol = 10) +
scale_color_viridis() +
labs(color = "density") +
theme_classic(base_size = 35) +
theme(
# legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=22, face = "italic")
)
# QC
meta_all <- subset(meta_all, nGene > 500 & percent_mito < 0.2)
dim(meta_all)
table(meta_all$sample)
mRNA_exprs <- mRNA_exprs[, meta_all$cell]
fig.size(3,12)
(ggplot(meta_all) +
geom_histogram(aes(x = nUMI))) +
theme_bw(base_size = 20) +
theme(text = element_text(size = 20),
legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)
)+
(ggplot(meta_all) +
geom_histogram(aes(x = nGene)) +
geom_vline(xintercept = 500, col = "darkred")) +
theme_bw(base_size = 20) +
theme(text = element_text(size = 20),
legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)
) +
(ggplot(meta_all) +
geom_histogram(aes(x = percent_mito)) +
geom_vline(xintercept = 0.2, col = "darkred")) +
theme_bw(base_size = 20) +
theme(text = element_text(size = 20),
legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)
)
BRI-399 BRI-401 BRI-403 BRI-405 BRI-407 BRI-409 BRI-411 BRI-413
8146 5610 4505 1085 2955 3354 2469 4229
BRI-415 BRI-417 BRI-419 BRI-421 BRI-425 BRI-427 BRI-429 BRI-431
8719 2372 4970 11950 1021 10182 982 1091
BRI-436 BRI-438 BRI-440 BRI-442 BRI-456 BRI-458 BRI-460 BRI-462
6661 2700 1386 3271 4644 11569 12618 13342
BRI-475 BRI-477 BRI-479 BRI-481 BRI-483 BRI-485 BRI-487 BRI-503
2510 2169 10603 1453 665 6037 9339 316
BRI-505 BRI-507 BRI-509 BRI-511 BRI-515 BRI-517 BRI-521 BRI-523
7857 8132 319 8513 9572 13932 714 5903
BRI-525 BRI-527 BRI-529 BRI-534 BRI-536 BRI-538 BRI-540 BRI-542
10196 380 3635 870 832 5257 9806 6159
BRI-544 BRI-546 BRI-548 BRI-550 BRI-552 BRI-554 BRI-556 BRI-558
348 2990 2620 6139 4190 512 10049 3820
BRI-560 BRI-562 BRI-564 BRI-566 BRI-570 BRI-581 BRI-583 BRI-585
2551 1053 601 11328 1510 284 10676 4564
BRI-587 BRI-589 BRI-601 BRI-603 BRI-605 BRI-607 BRI-609 BRI-611
10537 652 9852 354 10564 7718 1258 1057
BRI-613 BRI-615 BRI-617 BRI-619 BRI-621 BRI-623 BRI-625 BRI-627
3182 12194 3123 5285 304 7702 2624 1197
BRI-629 BRI-631 BRI-633R BRI-635 BRI-637R
1017 1004 215 4888 634
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`. `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# saveRDS(mRNA_exprs, "qc_mRNA_exprs_2020-01-24.rds")
# saveRDS(meta_all, "qc_meta_all_2020-01-24.rds")
mRNA_exprs <- readRDS("qc_mRNA_exprs_2020-01-24.rds")
meta_all <- readRDS("qc_meta_all_2020-01-24.rds")
library(MASS)
get_density <- function(x, y, ...) {
dens <- MASS::kde2d(x, y, ...)
ix <- findInterval(x, dens$x)
iy <- findInterval(y, dens$y)
ii <- cbind(ix,iy)
return(dens$z[ii])
}
plot_dens <- get_density(meta_all$percent_mito, meta_all$nGene, n = 100)
fig.size(4,6)
ggplot(meta_all, aes(x = percent_mito, y = nGene, color = plot_dens)) +
geom_point(size = .3) +
geom_hline(yintercept = 500, color = "darkred") +
geom_vline(xintercept = .2, color = "darkred") +
scale_color_viridis() +
labs(color = "density") +
# coord_cartesian(xlim = c(0, 1), ylim = c(0,12000)) +
theme_classic(base_size = 20) +
theme(
# legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=22, face = "italic")
)
temp <- as.data.frame(table(meta_all$sample))
sample_order <- temp[order(temp$Freq, decreasing = T),]
sample_order[1:4,]
s10000 <- as.character(sample_order[which(sample_order$Freq > 10000),]$Var1)
s5000 <-as.character(sample_order[which(sample_order$Freq > 5000 & sample_order$Freq < 10001),]$Var1)
s1000 <-as.character(sample_order[which(sample_order$Freq > 1000 & sample_order$Freq < 5001),]$Var1)
s100 <- as.character(sample_order[which(sample_order$Freq < 1001),]$Var1)
| Var1 | Freq | |
|---|---|---|
| <fct> | <int> | |
| 38 | BRI-517 | 13932 |
| 24 | BRI-462 | 13342 |
| 23 | BRI-460 | 12618 |
| 74 | BRI-615 | 12194 |
test <- meta_all[which(meta_all$sample %in% s10000),]
plot_dens <- get_density(test$percent_mito, test$nGene, n = 100)
fig.size(7,45)
ggplot(test,
aes(x = percent_mito, y = nGene, color = plot_dens)
) +
geom_point(size = .3) +
geom_hline(yintercept = 500, color = "darkred") +
geom_vline(xintercept = .2, color = "darkred") +
facet_wrap(sample ~ . , ncol = 10) +
scale_color_viridis() +
labs(color = "density") +
theme_classic(base_size = 28) +
theme(
# legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=22, face = "italic")
)
temp <- as.data.frame(table(meta_all$sample))
sample_order <- temp[order(temp$Freq, decreasing = T),]
# Barplot of cell number
options(repr.plot.height = 7, repr.plot.width = 15)
ggplot(data=sample_order, aes(x=reorder(Var1, -Freq), y=Freq)) +
geom_bar(stat="identity", width = 0.7) +
labs(x = "Sample", y = "Number of cells pass QC") +
theme_classic(base_size = 40) +
theme(
legend.position = "none",
axis.text.x = element_blank(),
# axis.ticks = element_blank(),
# axis.text.x = element_text(color = "black", angle = 45, hjust = 1, size = 20),
axis.text.y = element_text(color = "black", size = 30),
panel.grid = element_blank()
)
## Samples with lower cell number post-QC are the ones with lower quality
meta_all_before <- readRDS("meta_all_2020-01-24.rds")
meta_all_after <- readRDS("qc_pca_harmony_umap_meta_all_2020-01-25.rds")
# Percent of cell
before <- as.data.frame(table(meta_all_before$sample))
after <- as.data.frame(table(meta_all_after$sample))
all(before$Var1 == after$Var1)
qc = data.frame(
sample = as.character(before$Var1),
before_qc = before$Freq,
after_qc = after$Freq
)
qc$percent <- qc$after_qc / qc$before_qc
qc$p_badcells = (qc$before_qc - qc$after_qc)/qc$before_qc
qc[order(qc$percent, decreasing = F),][1:4,]
qc[order(qc$p_badcells, decreasing = T),][1:4,]
options(repr.plot.height = 5, repr.plot.width = 7)
ggplot(data=qc,
aes(x=before_qc, y=percent)
) +
geom_point(size = 2 ) +
geom_hline(yintercept = .4, color = "darkred") +
coord_cartesian(ylim = c(0, 1)) +
# geom_bar(stat="identity", width = 0.7) +
labs(x = "Cell number before QC", y = "Percent of cells\n pass QC") +
theme_classic(base_size = 25) +
theme(
legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
axis.text.x = element_text(color = "black", angle = 45, hjust = 1, size = 25),
axis.text.y = element_text(color = "black", size = 25),
panel.grid = element_blank()
)
ggsave(file = paste("sample_qc_1", ".pdf", sep = ""), width = 7, height = 5, dpi = 300)
ggplot(data=qc,
aes(x=p_badcells, y=after_qc)
) +
geom_point() +
# geom_hline(yintercept = .7, color = "darkred") +
# geom_bar(stat="identity", width = 0.7) +
labs(x = "Percent of low quality cells", y = "Cell number pass QC") +
theme_classic(base_size = 25) +
theme(
legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
axis.text.x = element_text(color = "black", angle = 45, hjust = 1, size = 25),
axis.text.y = element_text(color = "black", size = 25),
panel.grid = element_blank()
)
| sample | before_qc | after_qc | percent | p_badcells | |
|---|---|---|---|---|---|
| 39 | BRI-521 | 2347 | 714 | 0.3042182 | 0.6957818 |
| 40 | BRI-523 | 17866 | 5903 | 0.3304041 | 0.6695959 |
| 43 | BRI-529 | 9606 | 3635 | 0.3784093 | 0.6215907 |
| 85 | BRI-637R | 1225 | 634 | 0.5175510 | 0.4824490 |
| sample | before_qc | after_qc | percent | p_badcells | |
|---|---|---|---|---|---|
| 39 | BRI-521 | 2347 | 714 | 0.3042182 | 0.6957818 |
| 40 | BRI-523 | 17866 | 5903 | 0.3304041 | 0.6695959 |
| 43 | BRI-529 | 9606 | 3635 | 0.3784093 | 0.6215907 |
| 85 | BRI-637R | 1225 | 634 | 0.5175510 | 0.4824490 |
## Samples with lower cell number post-QC are the ones with lower quality
# meta_all_before <- readRDS("meta_all_2020-01-24.rds")
# meta_all_after <- readRDS("qc_pca_harmony_umap_meta_all_2020-01-25.rds")
temp1 <- meta_all_after %>%
group_by(sample) %>%
summarize(mean_perc_mito = mean(percent_mito, na.rm = TRUE),
mean_perc_nGene = mean(nGene, na.rm = TRUE)
# n_cell = n(),
# n_low_per_mito = sum(percent_mito < 0.2),
# p_low_per_mito = n_low_per_mito / n_cell
)
all(temp1$sample == qc$sample)
temp1 <- cbind(qc, temp1)
temp1 <- temp1[, -5]
# Plot "percent of cells pass QC" vs "average % MT genes for that sample"
options(repr.plot.height = 5, repr.plot.width = 7)
ggplot(data=temp1,
aes(x=100 * mean_perc_mito, y=percent)
) +
geom_point( size = 2 ) +
geom_hline(yintercept = .4, color = "darkred") +
# geom_bar(stat="identity", width = 0.7) +
labs(x = "Mean % MT after QC", y = "Percent of cells\n pass QC") +
theme_classic(base_size = 25) +
theme(
legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
axis.text.x = element_text(color = "black", angle = 45, hjust = 1, size = 30),
axis.text.y = element_text(color = "black", size = 30),
panel.grid = element_blank()
)
ggsave(file = paste("sample_qc_2", ".pdf", sep = ""), width = 7, height = 5, dpi = 300)
# ggplot(data=temp1,
# aes(x=p_low_per_mito, y=percent)
# ) +
# geom_point() +
# geom_hline(yintercept = .7, color = "darkred") +
# # geom_bar(stat="identity", width = 0.7) +
# # labs(x = "Mean % MT before QC", y = "Percent of cells\n pass QC") +
# theme_classic(base_size = 30) +
# theme(
# legend.position = "none",
# # axis.text = element_blank(),
# # axis.ticks = element_blank(),
# axis.text.x = element_text(color = "black", angle = 45, hjust = 1, size = 30),
# axis.text.y = element_text(color = "black", size = 30),
# panel.grid = element_blank()
# )
# # ggplot(data=temp1,
# # aes(x=mean_perc_nGene, y=percent)
# # ) +
# # geom_point() +
# # geom_hline(yintercept = .7, color = "darkred") +
# # # geom_bar(stat="identity", width = 0.7) +
# # # labs(x = "Cell number before QC", y = "Percent of cells\n pass QC") +
# # theme_classic(base_size = 30) +
# # theme(
# # legend.position = "none",
# # # axis.text = element_blank(),
# # # axis.ticks = element_blank(),
# # axis.text.x = element_text(color = "black", angle = 45, hjust = 1, size = 30),
# # axis.text.y = element_text(color = "black", size = 30),
# # panel.grid = element_blank()
# # )
fig.size(40,45)
ggplot() +
geom_point(
data = meta_all[sample(nrow(meta_all)),],
mapping = aes_string(x = "nGene", y = "nUMI", fill = "sample"),
size = 0.7, stroke = 0.01, shape = 21
) +
facet_wrap( ~ sample, ncol = 10) +
scale_x_continuous(trans = "log2") +
scale_y_continuous(trans = "log2") +
guides(col = FALSE) +
theme_bw(base_size = 20) +
theme(text = element_text(size = 20),
legend.position = "none",
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)
)
## Ribosomal reads
genes_ribo <- grep("^RPS\\d+|^RPL\\d+", row.names(mRNA_exprs), value = TRUE)
length(genes_ribo)
meta_all$percent.ribo <- Matrix::colSums(mRNA_exprs[genes_ribo, meta_all$cell]) /
Matrix::colSums(mRNA_exprs[, meta_all$cell])
## Normalize
exprs_norm <- mRNA_exprs %>% NormalizeDataSeurat()
dim(exprs_norm)
saveRDS(exprs_norm, "mRNA_exprs_norm_2020-01-24.rds")
# Find varible genes
genes_exclude <- grep("^MT-|^RPL|^RPS|MALAT1|MIR-", row.names(exprs_norm), value = TRUE)
vargenes_df <- FindVariableGenesBatch(exprs_norm, meta_all, genes_exclude, 0.7e3)
nrow(vargenes_df)
table(vargenes_df$N)
var_genes <- vargenes_df$gene
head(var_genes)
exprs_scaled <- exprs_norm[var_genes, ] %>% ScaleDataSeurat()
exprs_cosine <- exprs_scaled %>% cosine_normalize(2)
# PCA for top 20 PCs
pca_res <- irlba::prcomp_irlba(t(exprs_cosine), 20)
umap_res <- umap(pca_res$x, n_neighbors = 30, metric = "cosine", min_dist = .3)
meta_all$UMAP1 <- umap_res$layout[, 1]
meta_all$UMAP2 <- umap_res$layout[, 2]
meta_all <- cbind(meta_all, pca_res$x)
meta_all[1:4,]
| cell | sample | nUMI | nGene | percent_mito | percent.ribo | UMAP1 | UMAP2 | PC1 | PC2 | ⋯ | PC11 | PC12 | PC13 | PC14 | PC15 | PC16 | PC17 | PC18 | PC19 | PC20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <chr> | <dbl> | <int> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
| 1 | BRI-399_AAACCCAAGTCATTGC | BRI-399 | 7959 | 2497 | 0.15165222 | 0.13041839 | 6.6981000 | -5.381405 | -0.3571995 | -0.0390370210 | ⋯ | -0.023782410 | 0.02014607 | -0.002612597 | -0.114616503 | -0.09321071 | 0.016751547 | -0.026468537 | -0.071925509 | -0.013562808 | 0.044665006 |
| 2 | BRI-399_AAACCCAAGTGAATAC | BRI-399 | 8601 | 3055 | 0.06522497 | 0.07836298 | 0.4673366 | -9.940085 | -0.2261091 | -0.0163840411 | ⋯ | -0.057559679 | -0.04534581 | -0.009537267 | -0.008953708 | -0.08859925 | 0.001792422 | 0.013124503 | 0.022926293 | 0.101287338 | -0.414311488 |
| 4 | BRI-399_AAACCCACAGTCCGTG | BRI-399 | 13897 | 3833 | 0.07922573 | 0.14082176 | 6.9872802 | -7.038188 | -0.4850761 | 0.0008679194 | ⋯ | 0.002326752 | -0.01557515 | -0.049214707 | -0.064864437 | -0.03625999 | 0.017270126 | 0.005430456 | -0.107591652 | -0.001496217 | 0.077471782 |
| 5 | BRI-399_AAACCCAGTAGGAGGG | BRI-399 | 33644 | 5515 | 0.08474022 | 0.18550113 | 4.7951625 | 3.897222 | 0.2578570 | 0.3665536689 | ⋯ | -0.037175904 | -0.09064106 | -0.204887165 | 0.129006267 | -0.05470159 | 0.112012283 | 0.006601404 | -0.001725609 | -0.014012232 | 0.006655405 |
# # Save the update meta_all
# saveRDS(meta_all, "qc_pca_umap_meta_all_2020-01-24.rds")
meta_all <- readRDS("qc_pca_umap_meta_all_2020-01-24.rds")
meta_all[1:4,]
| cell | sample | nUMI | nGene | percent_mito | percent.ribo | UMAP1 | UMAP2 | PC1 | PC2 | ⋯ | PC11 | PC12 | PC13 | PC14 | PC15 | PC16 | PC17 | PC18 | PC19 | PC20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <chr> | <dbl> | <int> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
| 1 | BRI-399_AAACCCAAGTCATTGC | BRI-399 | 7959 | 2497 | 0.15165222 | 0.13041839 | 6.6981000 | -5.381405 | -0.3571995 | -0.0390370210 | ⋯ | -0.023782410 | 0.02014607 | -0.002612597 | -0.114616503 | -0.09321071 | 0.016751547 | -0.026468537 | -0.071925509 | -0.013562808 | 0.044665006 |
| 2 | BRI-399_AAACCCAAGTGAATAC | BRI-399 | 8601 | 3055 | 0.06522497 | 0.07836298 | 0.4673366 | -9.940085 | -0.2261091 | -0.0163840411 | ⋯ | -0.057559679 | -0.04534581 | -0.009537267 | -0.008953708 | -0.08859925 | 0.001792422 | 0.013124503 | 0.022926293 | 0.101287338 | -0.414311488 |
| 4 | BRI-399_AAACCCACAGTCCGTG | BRI-399 | 13897 | 3833 | 0.07922573 | 0.14082176 | 6.9872802 | -7.038188 | -0.4850761 | 0.0008679194 | ⋯ | 0.002326752 | -0.01557515 | -0.049214707 | -0.064864437 | -0.03625999 | 0.017270126 | 0.005430456 | -0.107591652 | -0.001496217 | 0.077471782 |
| 5 | BRI-399_AAACCCAGTAGGAGGG | BRI-399 | 33644 | 5515 | 0.08474022 | 0.18550113 | 4.7951625 | 3.897222 | 0.2578570 | 0.3665536689 | ⋯ | -0.037175904 | -0.09064106 | -0.204887165 | 0.129006267 | -0.05470159 | 0.112012283 | 0.006601404 | -0.001725609 | -0.014012232 | 0.006655405 |
# Harmonization: sample and tissue
options(repr.plot.height = 3, repr.plot.width = 5)
harmony <- HarmonyMatrix(# pca_res$x, meta_all, do_pca=FALSE,
meta_all[, c(9:28)], meta_all, do_pca=FALSE,
"sample", theta = 2,
# lambda = 1, # Not sure what lambda affects
# tau = 10,
epsilon.cluster = -Inf,
epsilon.harmony = -Inf,
max.iter.cluster = 50,
max.iter.harmony = 30,
plot_convergence = T)
Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Harmony 1/30 Harmony 2/30 Harmony 3/30 Harmony 4/30 Harmony 5/30 Harmony 6/30 Harmony 7/30 Harmony 8/30 Harmony 9/30 Harmony 10/30 Harmony 11/30 Harmony 12/30 Harmony 13/30 Harmony 14/30 Harmony 15/30 Harmony 16/30 Harmony 17/30 Harmony 18/30 Harmony 19/30 Harmony 20/30 Harmony 21/30 Harmony 22/30 Harmony 23/30 Harmony 24/30 Harmony 25/30 Harmony 26/30 Harmony 27/30 Harmony 28/30 Harmony 29/30 Harmony 30/30
colnames(harmony) <- paste0("harmonized_", colnames(harmony), sep="")
meta_all <- cbind(meta_all, harmony)
# Save the update meta_all
saveRDS(meta_all, "qc_pca_harmony_meta_all_2020-01-24.rds")
# UMAP with top 20 PCs
umap_res <- umap(harmony[, c(1:20)], n_neighbors = 30, metric = "cosine", min_dist = .1)
meta_all$harmonized_UMAP1 <- umap_res$layout[, 1]
meta_all$harmonized_UMAP2 <- umap_res$layout[, 2]
meta_all[1:4,]
| cell | sample | nUMI | nGene | percent_mito | percent.ribo | UMAP1 | UMAP2 | PC1 | PC2 | ⋯ | harmonized_PC13 | harmonized_PC14 | harmonized_PC15 | harmonized_PC16 | harmonized_PC17 | harmonized_PC18 | harmonized_PC19 | harmonized_PC20 | harmonized_UMAP1 | harmonized_UMAP2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <chr> | <dbl> | <int> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
| 1 | BRI-399_AAACCCAAGTCATTGC | BRI-399 | 7959 | 2497 | 0.15165222 | 0.13041839 | 6.6981000 | -5.381405 | -0.3571995 | -0.0390370210 | ⋯ | -0.002850743 | -0.077324210 | -0.056753212 | 0.0252208036 | -0.042332082 | -0.01766654 | -0.0004747563 | -0.005633132 | 11.135468 | -0.7312793 |
| 2 | BRI-399_AAACCCAAGTGAATAC | BRI-399 | 8601 | 3055 | 0.06522497 | 0.07836298 | 0.4673366 | -9.940085 | -0.2261091 | -0.0163840411 | ⋯ | 0.001328416 | -0.003706535 | -0.083106743 | -0.0007857185 | 0.005573716 | 0.02676186 | 0.0956605894 | -0.356151813 | 2.863655 | -10.3395105 |
| 4 | BRI-399_AAACCCACAGTCCGTG | BRI-399 | 13897 | 3833 | 0.07922573 | 0.14082176 | 6.9872802 | -7.038188 | -0.4850761 | 0.0008679194 | ⋯ | -0.049297706 | -0.031875093 | -0.003280377 | 0.0278151578 | -0.012838638 | -0.05469046 | 0.0117411423 | 0.027152254 | 10.381248 | -1.9383886 |
| 5 | BRI-399_AAACCCAGTAGGAGGG | BRI-399 | 33644 | 5515 | 0.08474022 | 0.18550113 | 4.7951625 | 3.897222 | 0.2578570 | 0.3665536689 | ⋯ | -0.134523139 | 0.086668640 | -0.041117325 | 0.0601768633 | -0.008440295 | -0.02368517 | -0.0036981598 | -0.004332571 | 1.240208 | 2.9091845 |
# # Save the update meta_all
# saveRDS(meta_all, "qc_pca_harmony_umap_meta_all_2020-01-25.rds")
meta_all <- readRDS("qc_pca_harmony_umap_meta_all_2020-01-25.rds")
library(parallel)
# Louvain
snn_pcs <- BuildSNNSeurat(meta_all[, c(29:48)], nn.eps = .5)
resolution_list <- c(0.2, 0.4, 0.6, 0.8, 1.0)
ids_cos <- Reduce(cbind, mclapply(resolution_list, function(res_use) {
Seurat:::RunModularityClustering(SNN = snn_pcs, modularity = 1,
resolution = res_use, algorithm = 3, n.start = 10,
n.iter = 10, random.seed = 0, print.output = FALSE,
temp.file.location = NULL, edge.file.name = NULL)
}, mc.cores = min(16, length(resolution_list))))
ids_cos %<>% data.frame()
colnames(ids_cos) <- sprintf("res_%.2f", resolution_list)
meta_all$res_0.20 <- ids_cos$res_0.20
meta_all$res_0.40 <- ids_cos$res_0.40
meta_all$res_0.60 <- ids_cos$res_0.60
meta_all$res_0.80 <- ids_cos$res_0.80
meta_all$res_1.00 <- ids_cos$res_1.00
# saveRDS(meta_all, "qc_pca_harmony_umap_clus_meta_all_2020-02-18.rds")
meta_all <- readRDS("qc_pca_harmony_umap_clus_meta_all_2020-02-18.rds")
options(repr.plot.height = 8, repr.plot.width = 10)
ggplot() +
geom_point(
data = meta_all[sample(nrow(meta_all)),],
mapping = aes_string(x = "harmonized_UMAP1 ", y = "harmonized_UMAP2", fill = "sample"),
size = 0.2, stroke = 0.0001, shape = 21
) +
# scale_fill_manual(values = meta_colors$sample, name = "") +
labs(
x = "UMAP1",
y = "UMAP2",
title = paste0("Clustering of ", nrow(meta_all), " cells; \nColored by ", nrow(as.data.frame(table(meta_all$sample))), " samples")
) +
theme_bw(base_size = 30) +
theme(
legend.position = "none",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30)
)
clin_meta <- readRDS("clin_meta_2020-01-26.rds")
Cedars <- table(clin_meta$site, clin_meta$mRNA_run)[1,]
Cedars <- names(Cedars[which(Cedars == 1)])
Columbia <- table(clin_meta$site, clin_meta$mRNA_run)[2,]
Columbia <- names(Columbia[which(Columbia == 1)])
Feinstein <- table(clin_meta$site, clin_meta$mRNA_run)[3,]
Feinstein <- names(Feinstein[which(Feinstein == 1)])
length(Feinstein)
HSS <- table(clin_meta$site, clin_meta$mRNA_run)[4,]
HSS <- names(HSS[which(HSS == 1)])
Northwestern <- table(clin_meta$site, clin_meta$mRNA_run)[5,]
Northwestern <- names(Northwestern[which(Northwestern == 1)])
UAB <- table(clin_meta$site, clin_meta$mRNA_run)[6,]
UAB <- names(UAB[which(UAB == 1)])
length(UAB)
UC_San_Diego <- table(clin_meta$site, clin_meta$mRNA_run)[7,]
UC_San_Diego <- names(UC_San_Diego[which(UC_San_Diego == 1)])
UK_Birmingham <- table(clin_meta$site, clin_meta$mRNA_run)[8,]
UK_Birmingham <- names(UK_Birmingham[which(UK_Birmingham == 1)])
UK_London <- table(clin_meta$site, clin_meta$mRNA_run)[9,]
UK_London <- names(UK_London[which(UK_London == 1)])
Colorado <- table(clin_meta$site, clin_meta$mRNA_run)[10,]
Colorado <- names(Colorado[which(Colorado == 1)])
Pittsburgh <- table(clin_meta$site, clin_meta$mRNA_run)[11,]
Pittsburgh <- names(Pittsburgh[which(Pittsburgh == 1)])
length(Pittsburgh)
Rochester <- table(clin_meta$site, clin_meta$mRNA_run)[12,]
Rochester <- names(Rochester[which(Rochester == 1)])
meta_all$site <- meta_all$sample
meta_all$site[which(meta_all$site %in% Cedars)] <- "Cedars"
meta_all$site[which(meta_all$site %in% Columbia)] <- "Columbia"
meta_all$site[which(meta_all$site %in% Feinstein)] <- "Feinstein"
meta_all$site[which(meta_all$site %in% HSS)] <- "HSS"
meta_all$site[which(meta_all$site %in% Northwestern)] <- "Northwestern"
meta_all$site[which(meta_all$site %in% UAB)] <- "UAB"
meta_all$site[which(meta_all$site %in% UC_San_Diego)] <- "UC_San_Diego"
meta_all$site[which(meta_all$site %in% UK_Birmingham)] <- "UK_Birmingham"
meta_all$site[which(meta_all$site %in% UK_London)] <- "UK_London"
meta_all$site[which(meta_all$site %in% Colorado)] <- "Colorado"
meta_all$site[which(meta_all$site %in% Pittsburgh)] <- "Pittsburgh"
meta_all$site[which(meta_all$site %in% Rochester)] <- "Rochester"
meta_all$site[which(meta_all$site %in% "BRI-548")] <- "Rochester"
meta_all$site[which(meta_all$site %in% "BRI-552")] <- "Feinstein"
meta_all$site[which(meta_all$site %in% "BRI-581")] <- "UAB"
table(meta_all$site)
# saveRDS(meta_all, "qc_pca_harmony_umap_meta_all_site_2020-01-25.rds")
Cedars Colorado Columbia Feinstein HSS
19684 13427 35301 4190 74262
Northwestern Pittsburgh Rochester UAB UC_San_Diego
11490 40966 15984 2904 70837
UK_Birmingham UK_London
86364 28187
options(repr.plot.height = 7, repr.plot.width = 20)
ggplot() +
geom_point(
data = meta_all[sample(nrow(meta_all)),],
mapping = aes_string(x = "harmonized_UMAP1", y = "harmonized_UMAP2", fill = "site"),
size = 0.07, stroke = 0.0001, shape = 21
) +
facet_wrap(site ~ ., ncol = 6) +
# scale_fill_manual(values = meta_colors$sample, name = "") +
labs(
x = "UMAP1",
y = "UMAP2",
title = paste0("Clustering of ", nrow(meta_all), " cells; \nColored by ", nrow(as.data.frame(table(meta_all$site))), " site")
) +
theme_bw(base_size = 30) +
theme(
legend.position = "none",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30)
)
options(repr.plot.height = 8, repr.plot.width = 10)
ggplot() +
geom_point(
data = meta_all[sample(nrow(meta_all)),],
mapping = aes_string(x = "harmonized_UMAP1", y = "harmonized_UMAP2", fill = "nGene"),
size = 0.1, stroke = 0.0001, shape = 21
) +
scale_fill_gradientn(
colours = colorRampPalette(RColorBrewer::brewer.pal(9, "Reds"))(7)[1:7],
name = "nUMI"
) +
labs(
x = "UMAP1",
y = "UMAP2"
) +
theme_bw(base_size = 30) +
theme(
legend.position = "none",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30)
)
ggplot() +
geom_point(
data = meta_all[sample(nrow(meta_all)),],
mapping = aes_string(x = "harmonized_UMAP1", y = "harmonized_UMAP2", fill = "percent_mito"),
size = 0.1, stroke = 0.0001, shape = 21
) +
scale_fill_gradientn(
colours = colorRampPalette(RColorBrewer::brewer.pal(9, "Reds"))(7)[1:7],
name = "nUMI"
) +
labs(
x = "UMAP1",
y = "UMAP2"
) +
theme_bw(base_size = 30) +
theme(
legend.position = "none",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30)
)
# Load the pre-QC mRNA cells
meta_all <- readRDS("qc_pca_harmony_umap_meta_all_2020-01-25.rds")
adt_exprs <- readRDS("adt_exprs_2020-01-24.rds")
adt_exprs <- adt_exprs[, meta_all$cell]
all(colnames(adt_exprs) == meta_all$cell)
dim(adt_exprs)
fig.size(5,7)
hist(rowSums(as.matrix(adt_exprs[, meta_all$sample == "BRI-399"] < 5))/sum(meta_all$sample == "BRI-399"), cex.main=0.5, cex.sub=0.5)
hist(rowSums(as.matrix(adt_exprs[, meta_all$sample == "BRI-415"] < 5))/sum(meta_all$sample == "BRI-415"), cex.main=0.5, cex.sub=0.5)
hist(rowSums(as.matrix(adt_exprs[, meta_all$sample == "BRI-483"] < 5))/sum(meta_all$sample == "BRI-483"), cex.main=0.5, cex.sub=0.5)
# Why normalizeCLR_dgc for protein data?
# adt_exprs_norm <- adt_exprs %>% singlecellmethods::normalizeData(method = "cellCLR")
# saveRDS(adt_exprs_norm, "adt_exprs_norm_2020-01-25.rds")
adt_exprs_norm <- readRDS("adt_exprs_norm_2020-01-25.rds")
dim(adt_exprs_norm)
?normalizeCLR_dgc
# “This is what the protein looks like, no filtering” to show that different markers have different levels of background
plot_protein <- rownames(adt_exprs)[1:20]
myplots <- list()
for (i in 1:length(plot_protein)) {
gene <- plot_protein[i]
max.cutoff = quantile(adt_exprs_norm[gene,], .99)
min.cutoff = quantile(adt_exprs_norm[gene,], .01)
tmp <- sapply(X = adt_exprs_norm[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "plasma", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 15, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
# “This is what the protein looks like, no filtering” to show that different markers have different levels of background
plot_protein <- rownames(adt_exprs)[21:40]
myplots <- list()
for (i in 1:length(plot_protein)) {
gene <- plot_protein[i]
max.cutoff = quantile(adt_exprs_norm[gene,], .99)
min.cutoff = quantile(adt_exprs_norm[gene,], .01)
tmp <- sapply(X = adt_exprs_norm[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "plasma", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 15, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
# “This is what the protein looks like, no filtering” to show that different markers have different levels of background
plot_protein <- rownames(adt_exprs)[41:nrow(adt_exprs)]
myplots <- list()
for (i in 1:length(plot_protein)) {
gene <- plot_protein[i]
max.cutoff = quantile(adt_exprs_norm[gene,], .99)
min.cutoff = quantile(adt_exprs_norm[gene,], .01)
tmp <- sapply(X = adt_exprs_norm[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "plasma", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 15, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
# Histograms of normalized, unfiltered expression for each protein marker
options(repr.plot.height = 24, repr.plot.width = 30)
Reduce(`+`, lapply(which(rowSums(adt_exprs_norm > 0) > 0), function(x) {
ggplot(as.data.frame(adt_exprs_norm[x,]), aes(adt_exprs_norm[x,])) +
geom_histogram(binwidth = max(adt_exprs_norm[x,])/100) +
labs(title=row.names(adt_exprs_norm)[x], x = "CLR-Normalized ADT counts") +
theme_bw(base_size = 15)+
theme(
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = "none"
)
}))
# Histograms of normalized, unfiltered expression for each protein marker
options(repr.plot.height = 24, repr.plot.width = 30)
Reduce(`+`, lapply(which(rowSums(adt_exprs_norm > 0) > 0), function(x) {
ggplot(as.data.frame(adt_exprs_norm[x,]), aes(adt_exprs_norm[x,])) +
geom_histogram(binwidth = max(adt_exprs_norm[x,])/100) +
geom_vline(xintercept = shift_factor[x], color = "red") +
labs(title=row.names(adt_exprs_norm)[x], x = "CLR-Normalized ADT counts") +
theme_bw(base_size = 15)+
theme(
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = "none"
)
}))
shift_factor <- apply(adt_exprs_norm, 1, function(x) {
## find the highest peak in the density histogram after 0, and multiply by 2 to find the end of the peak
density(x[x > 0])$x[which.max(density(x[x > 0])$y)]*2
})
# manually adjust a couple markers' thresholds
shift_factor["CD31_prot"] <- 3.2
shift_factor["FR-beta_prot"] <- 2.2
shift_factor["HLA-DR_prot"] <- 2.4
shift_factor["CD3_prot"] <- 1
shift_factor["CD34_prot"] <- 1.55
shift_factor["CD45(2D1)_prot"] <- 2.2
shift_factor["CD44_prot"] <- 1.8
shift_factor["CD11b_prot"] <- 1
shift_factor["CD64_prot"] <- 1.1
# I don't know if this is necessary, but because I identified this threshold in normalized expression space, I reversed the normalization to get the equivalent thresholds in raw UMI count space
shift_mat <- (exp(shift_factor) - 1)
shift_mat <- as.matrix(shift_factor, ncol = 1) %*% t(as.matrix(apply(adt_exprs[,meta_all$cell], 2, function(x) {exp(sum(log(x + 1))/length(x))})))
# Subtracted filter thresholds
adt_exprs_norm_filter <- adt_exprs - shift_mat
# Set negatives to 0
adt_exprs_norm_filter[adt_exprs_norm_filter < 0] <- 0
# Re-normalize
adt_exprs_norm_filter <- singlecellmethods::normalizeData(adt_exprs_norm_filter, method = "cellCLR")
# Save the post-QC normalized protein expression
saveRDS(adt_exprs_norm_filter, "adt_exprs_norm_filter_2020-02-18.rds")
# Histograms of normalized, filtered expression for each protein marker
options(repr.plot.height = 24, repr.plot.width = 30)
Reduce(`+`, lapply(which(rowSums(adt_exprs_norm_filter > 0) > 0), function(x) {
ggplot(as.data.frame(adt_exprs_norm_filter[x,]), aes(adt_exprs_norm_filter[x,])) +
geom_histogram(binwidth = max(adt_exprs_norm_filter[x,])/100) +
labs(title=row.names(adt_exprs_norm_filter)[x], x = "CLR-Normalized ADT counts") +
theme_bw(base_size = 15)+
theme(
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = "none"
)
}))
# “This is what the protein looks like, no filtering” to show that different markers have different levels of background
plot_protein <- rownames(adt_exprs)[1:20]
myplots <- list()
for (i in 1:length(plot_protein)) {
gene <- plot_protein[i]
max.cutoff = quantile(adt_exprs_norm_filter[gene,], .99)
min.cutoff = quantile(adt_exprs_norm_filter[gene,], .01)
tmp <- sapply(X = adt_exprs_norm_filter[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "plasma", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 15, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
# “This is what the protein looks like, no filtering” to show that different markers have different levels of background
plot_protein <- rownames(adt_exprs)[21:40]
myplots <- list()
for (i in 1:length(plot_protein)) {
gene <- plot_protein[i]
max.cutoff = quantile(adt_exprs_norm_filter[gene,], .99)
min.cutoff = quantile(adt_exprs_norm_filter[gene,], .01)
tmp <- sapply(X = adt_exprs_norm_filter[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "plasma", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 15, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
# “This is what the protein looks like, no filtering” to show that different markers have different levels of background
plot_protein <- rownames(adt_exprs)[41:nrow(adt_exprs)]
myplots <- list()
for (i in 1:length(plot_protein)) {
gene <- plot_protein[i]
max.cutoff = quantile(adt_exprs_norm_filter[gene,], .99)
min.cutoff = quantile(adt_exprs_norm_filter[gene,], .01)
tmp <- sapply(X = adt_exprs_norm_filter[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "plasma", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 15, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
# K-L divergence (with modification to prevent lowly expressed proteins from getting high KL-divergences)
# For each marker, comparing distributions:
# Test: distribution of cells across clusters with marker expression > 85th percentile of expression for that marker
# Null: Distribution of cells across clusters
library(entropy)
kl_div_cor_adt_mrna_nofilter <- sapply(1:nrow(adt_exprs_norm), function(ab_idx) {
tmp <- factor(meta_all$res_0.80[adt_exprs_norm[ab_idx,] > quantile(adt_exprs_norm[ab_idx,], .85)])
tmp <- factor(tmp, levels = c(levels(tmp), which(!names(table(meta_all$res_0.80)) %in% levels(tmp))-1))
tmp.table <- table(tmp)[order(names(table(tmp)))]
tst <- tmp.table/table(meta_all$res_0.80)[order(names(table(meta_all$res_0.80)))]
tmp.table <- tmp.table + min(tst[tst > 0])*table(meta_all$res_0.80)[order(names(table(meta_all$res_0.80)))]
freq1 <- tmp.table/sum(tmp.table)
null.table <- table(meta_all$res_0.80)/sum(table(meta_all$res_0.80))
freq2 <- null.table[order(names(null.table))]
KL.plugin(freq1, freq2)
})
names(kl_div_cor_adt_mrna_nofilter) <- row.names(adt_exprs_norm)
library(entropy)
kl_div_cor_adt_mrna_filter <- sapply(1:nrow(adt_exprs_norm_filter), function(ab_idx) {
tmp <- factor(meta_all$res_0.80[adt_exprs_norm_filter[ab_idx,] > quantile(adt_exprs_norm_filter[ab_idx,], .85)])
tmp <- factor(tmp, levels = c(levels(tmp), which(!names(table(meta_all$res_0.80)) %in% levels(tmp))-1))
tmp.table <- table(tmp)[order(names(table(tmp)))]
tst <- tmp.table/table(meta_all$res_0.80)[order(names(table(meta_all$res_0.80)))]
tmp.table <- tmp.table + min(tst[tst > 0])*table(meta_all$res_0.80)[order(names(table(meta_all$res_0.80)))]
freq1 <- tmp.table/sum(tmp.table)
null.table <- table(meta_all$res_0.80)/sum(table(meta_all$res_0.80))
freq2 <- null.table[order(names(null.table))]
KL.plugin(freq1, freq2)
})
names(kl_div_cor_adt_mrna_filter) <- row.names(adt_exprs_norm_filter)
dat_kl = data.frame(
before = kl_div_cor_adt_mrna_nofilter,
after = kl_div_cor_adt_mrna_filter
)
rownames(dat_kl) <- names(kl_div_cor_adt_mrna_filter)
dat_kl[1:4,]
fig.size(7,8)
ggplot(data = dat_kl, aes(x = before, y = after, label = rownames(dat_kl))) +
# stat_smooth(method = "lm", formula = y ~ x, se = FALSE) +
geom_abline(intercept = 0, slope = 1, color = "blue") +
geom_point() +
coord_cartesian(xlim = c(0, 1.25), ylim = c(0, 1.25)) +
# xlab("") +
# ylab("") +
geom_text_repel(hjust = 0.5) +
theme_classic(base_size = 25)+
theme(
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=25, face = "italic"), # face="bold.italic"
# legend.position = "none"
)
| before | after | |
|---|---|---|
| <dbl> | <dbl> | |
| CD107a/LAMP1_prot | 0.51662735 | 0.53002044 |
| CD112/Nectin-2_prot | 0.78644584 | 0.78004980 |
| CD119/IFN-gamma-R-alpha-chain_prot | 0.05630272 | 0.06511971 |
| CD11b_prot | 1.18662327 | 1.18633181 |
rownames(adt_exprs_norm)
# exprs_norm <- readRDS("qc_mRNA_exprs_2020-01-24.rds")
# meta_all <- readRDS("qc_mRNA_protein_pca_umap_clus_meta_all_anno_hpca_2020-02-20.rds")
# adt_exprs_norm_filter <- readRDS("adt_exprs_norm_filter_2020-02-18.rds")
# Matched gene names with rownames(adt_exprs_norm)
# matched_genes <- c("", "", "CD19", "", "",
# "", "CD226", "TNFSF11", "", "", "", "",
# "", "", "", "", "", "","", "", "", "",
# "CDH5", "", "MERTK", "", "CXCR5", "", "", "HAVCR2", "IGHD", "")
matched_genes <- c("LAMP1", "NECTIN2", "IFNGR1", "ITGAM", "ITGAX",
"IL7R", "PDGFRA", "THBD", "CDH11", "MCAM",
"CD14", "PVR", "KLRB1", "CD163", "FCGR3A",
"CCR2", "CCR5", "CCR6", "CD19", "CD1C",
"MRC1", "CD209", "MS4A1", "CR2", "CD226",
"CD24", "PDCD1LG2", "CD274", "ICOS", "PDCD1",
"CD27", "NRP1", "KDR", "KLRK1", "PECAM1",
"CD34", "CD38", "CD3D", "CD44", "PTPRC",
"PTPRC", "PTPRC", "CD4", "CD55", "NCAM1",
"FCGR1A", "CD68", "CD69", "CD86", "CD8A",
"THY1", "CX3CR1", "EGFR", "FOLR2", "HLA-DRA",
"IGHG1", "IGHM", "PDPN"
)
length(matched_genes)
matched_genes[which((!matched_genes %in% row.names(exprs_norm)))]
# Check expression of gene expression
myplots <- list()
for (i in 1:20) {
gene <- matched_genes[i]
max.cutoff = quantile(exprs_norm[gene,], .99)
min.cutoff = quantile(exprs_norm[gene,], .01)
tmp <- sapply(X = exprs_norm[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "viridis", end = .9) +
labs(x="", y="") +
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 15, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
# Check expression of gene expression
matched_genes <- c("CD3D", "CD4")
myplots <- list()
for (i in 1:length(matched_genes)) {
gene <- matched_genes[i]
max.cutoff = quantile(exprs_norm[gene,], .99)
min.cutoff = quantile(exprs_norm[gene,], .01)
tmp <- sapply(X = exprs_norm[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "viridis", end = .9) +
labs(x="", y="") +
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 3, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
matched_genes %in% rownames(exprs_norm)
# Check expression of gene expression
gene_plot <- c("CX3CR1_prot")
myplots <- list()
for (i in 1) {
gene <- gene_plot[i]
max.cutoff = quantile(adt_exprs_norm_filter[gene,], .99)
min.cutoff = quantile(adt_exprs_norm_filter[gene,], .01)
tmp <- sapply(X = adt_exprs_norm_filter[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
# data = meta_all[sample(nrow(meta_all)),],
data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "viridis", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 4, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
# Check expression of gene expression
gene_plot <- c("IL1B", "VSIG4", "CX3CR1")
myplots <- list()
for (i in 1:3) {
gene <- gene_plot[i]
max.cutoff = quantile(exprs_norm[gene,], .99)
min.cutoff = quantile(exprs_norm[gene,], .01)
tmp <- sapply(X = exprs_norm[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "viridis", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 4, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
# Check expression of gene expression
myplots <- list()
for (i in 41:length(matched_genes)) {
gene <- matched_genes[i]
max.cutoff = quantile(exprs_norm[gene,], .99)
min.cutoff = quantile(exprs_norm[gene,], .01)
tmp <- sapply(X = exprs_norm[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "viridis", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 15, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
cor_expr_before <- sapply(which(matched_genes %in% row.names(exprs_norm)), function(x){cor(exprs_norm[matched_genes[x],], adt_exprs_norm[x,], method = "spearman")})
names(cor_expr_before) <- matched_genes[which(matched_genes %in% row.names(exprs_norm))]
mean_exp_before <- apply(exprs_norm[matched_genes,], 1, function(x){mean(x[x!=0])})
fig.size(7,8)
ggplot(data = as.data.frame(cor_expr_before[!is.na(cor_expr_before)]), aes(x = mean_exp_before[which(matched_genes %in% row.names(exprs_norm))][!is.na(cor_expr_before)],
y = cor_expr_before[!is.na(cor_expr_before)],
label = sub('\\/.*', '', sub('\\_.*', '', rownames(adt_exprs_norm)))
)) +
stat_smooth(method = "lm", formula = y ~ x, se = FALSE) +
geom_point() +
xlab("Average mRNA expression") +
ylab("Spearman correlation between\n mRNA and protein") +
geom_text_repel() +
theme_classic(base_size = 25)+
theme(
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=25, face = "italic"), # face="bold.italic"
legend.position = "none"
)
cor_expr <- sapply(which(matched_genes %in% row.names(exprs_norm)), function(x){cor(exprs_norm[matched_genes[x],], adt_exprs_norm_filter[x,], method = "spearman")})
names(cor_expr) <- matched_genes[which(matched_genes %in% row.names(exprs_norm))]
mean_exp <- apply(exprs_norm[matched_genes,], 1, function(x){mean(x[x!=0])})
fig.size(7,8)
ggplot(data = as.data.frame(cor_expr[!is.na(cor_expr)]), aes(x = mean_exp[which(matched_genes %in% row.names(exprs_norm))][!is.na(cor_expr)],
y = cor_expr[!is.na(cor_expr)],
label = sub('\\/.*', '', sub('\\_.*', '', rownames(adt_exprs_norm)))
)) +
stat_smooth(method = "lm", formula = y ~ x, se = FALSE) +
geom_point() +
xlab("Average mRNA expression") +
ylab("Spearman correlation between\n mRNA and protein") +
geom_text_repel() +
theme_classic(base_size = 25)+
theme(
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=25, face = "italic"), # face="bold.italic"
legend.position = "none"
)
length(cor_expr_before)
length(cor_expr)
fig.size(7,8)
ggplot(data = as.data.frame(cor_expr[!is.na(cor_expr)]), aes(x = cor_expr_before,
y = cor_expr[!is.na(cor_expr)],
label = sub('\\/.*', '', sub('\\_.*', '', rownames(adt_exprs_norm)))
)) +
stat_smooth(method = "lm", formula = y ~ x, se = FALSE) +
geom_point() +
xlab("Correlation (before QC)") +
ylab("Correlation (after QC)") +
geom_text_repel() +
theme_classic(base_size = 25)+
theme(
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=25, face = "italic"), # face="bold.italic"
legend.position = "none"
)
fig.size(7,8)
ggplot(data = as.data.frame(cor_expr[!is.na(cor_expr)]), aes(x = dat_kl$before,
y = cor_expr[!is.na(cor_expr)],
label = sub('\\/.*', '', sub('\\_.*', '', rownames(adt_exprs_norm)))
)) +
stat_smooth(method = "lm", formula = y ~ x, se = FALSE) +
geom_point() +
# xlab("Average mRNA expression") +
# ylab("Spearman correlation between\n mRNA and protein") +
geom_text_repel() +
theme_classic(base_size = 25)+
theme(
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=25, face = "italic"), # face="bold.italic"
legend.position = "none"
)
ggplot(data = as.data.frame(cor_expr[!is.na(cor_expr)]), aes(x = dat_kl$after,
y = cor_expr[!is.na(cor_expr)],
label = sub('\\/.*', '', sub('\\_.*', '', rownames(adt_exprs_norm)))
)) +
stat_smooth(method = "lm", formula = y ~ x, se = FALSE) +
geom_point() +
xlab("KL divergence (cluster specificity)") +
ylab("Spearman correlation between\n mRNA and protein") +
geom_text_repel() +
theme_classic(base_size = 25)+
theme(
# axis.text = element_blank(),
# axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=25, face = "italic"), # face="bold.italic"
legend.position = "none"
)
meta_all[1:4,]
| cell | sample | nUMI | nGene | percent_mito | percent.ribo | UMAP1 | UMAP2 | PC1 | PC2 | ⋯ | harmonized_PC19 | harmonized_PC20 | harmonized_UMAP1 | harmonized_UMAP2 | gene | res_0.20 | res_0.40 | res_0.60 | res_0.80 | res_1.00 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <chr> | <dbl> | <int> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <int> | <int> | <int> | <int> | |
| 1 | BRI-399_AAACCCAAGTCATTGC | BRI-399 | 7959 | 2497 | 0.15165222 | 0.13041839 | 6.6981000 | -5.381405 | -0.3571995 | -0.0390370210 | ⋯ | -0.0004747563 | -0.005633132 | 11.135468 | -0.7312793 | 0.000000 | 1 | 1 | 0 | 3 | 2 |
| 2 | BRI-399_AAACCCAAGTGAATAC | BRI-399 | 8601 | 3055 | 0.06522497 | 0.07836298 | 0.4673366 | -9.940085 | -0.2261091 | -0.0163840411 | ⋯ | 0.0956605894 | -0.356151813 | 2.863655 | -10.3395105 | 0.000000 | 13 | 18 | 19 | 20 | 21 |
| 4 | BRI-399_AAACCCACAGTCCGTG | BRI-399 | 13897 | 3833 | 0.07922573 | 0.14082176 | 6.9872802 | -7.038188 | -0.4850761 | 0.0008679194 | ⋯ | 0.0117411423 | 0.027152254 | 10.381248 | -1.9383886 | 1.105236 | 1 | 1 | 0 | 3 | 8 |
| 5 | BRI-399_AAACCCAGTAGGAGGG | BRI-399 | 33644 | 5515 | 0.08474022 | 0.18550113 | 4.7951625 | 3.897222 | 0.2578570 | 0.3665536689 | ⋯ | -0.0036981598 | -0.004332571 | 1.240208 | 2.9091845 | 0.000000 | 2 | 2 | 6 | 13 | 14 |
# Scale expression to z-scores across cell types
adt_exprs_scaled <- adt_exprs_norm_filter[rowSums(as.matrix(adt_exprs_norm_filter)) > 0,] %>% singlecellmethods::scaleData()
adt_pca_res <- irlba::prcomp_irlba(t(adt_exprs_scaled), 20)
# Harmonization: sample and tissue
options(repr.plot.height = 3, repr.plot.width = 5)
harmony <- HarmonyMatrix(adt_pca_res$x, meta_all, "sample", theta = 2,
do_pca = FALSE,
# lambda = 1,
# tau = 10,
epsilon.cluster = -Inf,
epsilon.harmony = -Inf,
max.iter.cluster = 30,
max.iter.harmony = 20,
plot_convergence = T
)
Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Warning message: “Quick-TRANSfer stage steps exceeded maximum (= 20179800)” Harmony 1/20 Harmony 2/20 Harmony 3/20 Harmony 4/20 Harmony 5/20 Harmony 6/20 Harmony 7/20 Harmony 8/20 Harmony 9/20 Harmony 10/20 Harmony 11/20 Harmony 12/20 Harmony 13/20 Harmony 14/20 Harmony 15/20 Harmony 16/20 Harmony 17/20 Harmony 18/20 Harmony 19/20 Harmony 20/20
colnames(harmony) <- paste0("protein_harmonized_", colnames(harmony), sep="")
meta_all <- cbind(meta_all, harmony)
saveRDS(meta_all, "qc_mRNA_protein_meta_all_2020-02-19.rds")
# UMAP with top 20 PCs
# adt_umap_res <- umap(harmony, n_neighbors = 50, metric = "cosine", min_dist = .3)
# meta_all <- readRDS("qc_mRNA_protein_meta_all_2020-02-19.rds")
adt_umap_res <- umap(meta_all[,c(57:66)], n_neighbors = 50, metric = "cosine", min_dist = .3)
meta_all$UMAP1_adt <- adt_umap_res$layout[, 1]
meta_all$UMAP2_adt <- adt_umap_res$layout[, 2]
# saveRDS(meta_all, "qc_mRNA_protein_meta_all_2020-02-20.rds")
library(parallel)
# Louvain
meta_all <- readRDS("qc_mRNA_protein_meta_all_2020-02-20.rds")
snn_pcs <- BuildSNNSeurat(meta_all[, c(57:66)], nn.eps = .5)
resolution_list <- c(0.2, 0.4)
ids_cos <- Reduce(cbind, mclapply(resolution_list, function(res_use) {
Seurat:::RunModularityClustering(SNN = snn_pcs, modularity = 1,
resolution = res_use, algorithm = 3, n.start = 10,
n.iter = 10, random.seed = 0, print.output = FALSE,
temp.file.location = NULL, edge.file.name = NULL)
}, mc.cores = min(16, length(resolution_list))))
ids_cos %<>% data.frame()
colnames(ids_cos) <- sprintf("res_%.2f", resolution_list)
meta_all$res_0.20_adt <- ids_cos$res_0.20
meta_all$res_0.40_adt <- ids_cos$res_0.40
# meta_all$res_0.60_adt <- ids_cos$res_0.60
# meta_all$res_0.80_adt <- ids_cos$res_0.80
# meta_all$res_1.00_adt <- ids_cos$res_1.00
# saveRDS(meta_all, "qc_mRNA_protein_pca_umap_clus_meta_all_2020-02-20.rds")
meta_all <- readRDS("qc_mRNA_protein_pca_umap_clus_meta_all_2020-02-20.rds")
meta_all$res_0.20_adt <- as.character(meta_all$res_0.20_adt)
meta_all$res_0.40_adt <- as.character(meta_all$res_0.40_adt)
cluster_center <- meta_all %>%
group_by(res_0.20_adt) %>%
summarise_at(vars(UMAP1_adt, UMAP2_adt), funs(median(., na.rm=TRUE)))
cluster_center <- as.data.frame(cluster_center)
cluster_center$res_0.20_adt <- as.character(cluster_center$res_0.20_adt)
options(repr.plot.height = 8, repr.plot.width = 12)
ggplot(meta_all[sample(nrow(meta_all)),],
aes(x = UMAP1_adt, y = UMAP2_adt, fill= res_0.20_adt)
) +
geom_point(size = 0.6, stroke = 0.0001, shape = 21, alpha = 0.6) +
geom_label_repel(
data = cluster_center,
aes(label = res_0.20_adt, fill = res_0.20_adt),
# fontface = 'bold',
size = 10,
box.padding = unit(0.2, "lines"),
point.padding = unit(0.2, "lines"),
segment.color = 'grey50'
) +
scale_fill_manual(values = meta_colors$cluster, name = "") +
labs(
x = "UMAP1_adt",
y = "UMAP2_adt"
) +
theme_bw(base_size = 30) +
theme(
legend.position = "none",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic")
)
options(repr.plot.height = 8, repr.plot.width = 12)
ggplot() +
geom_point(
data = meta_all[sample(nrow(meta_all)),],
mapping = aes_string(x = "UMAP1_adt", y = "UMAP2_adt", fill = "sample"),
size = 0.5, stroke = 0.0001, shape = 21
) +
# scale_fill_manual(values = meta_colors$sample, name = "") +
labs(
x = "UMAP1_adt",
y = "UMAP2_adt"
# title = paste0("Clustering of ", nrow(meta_all), " cells on proteins; \nColored by ", nrow(as.data.frame(table(meta_all$sample))), " samples")
) +
theme_bw(base_size = 30) +
theme(
legend.position = "none",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30)
)
meta_all$res_0.20 <- as.character(meta_all$res_0.20)
meta_all$res_0.40 <- as.character(meta_all$res_0.40)
meta_all$res_0.60 <- as.character(meta_all$res_0.60)
set3 = c("#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#D9D9D9", "#BC80BD", "#CCEBC5", "#FFED6F")
set2 = c("#66C2A5", "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", "#E5C494", "#B3B3B3")
colors37 = c("#466791","#60bf37","#953ada","#4fbe6c","#ce49d3","#a7b43d","#5a51dc","#d49f36","#552095","#507f2d","#db37aa","#84b67c","#a06fda","#df462a","#5b83db","#c76c2d","#4f49a3","#82702d","#dd6bbb","#334c22","#d83979","#55baad","#dc4555","#62aad3","#8c3025","#417d61","#862977","#bba672","#403367","#da8a6d","#a79cd4","#71482c","#c689d0","#6b2940","#d593a7","#895c8b","#bd5975")
cluster_center <- meta_all %>%
group_by(res_0.60) %>%
summarise_at(vars(harmonized_UMAP1, harmonized_UMAP2), funs(median(., na.rm=TRUE)))
cluster_center <- as.data.frame(cluster_center)
cluster_center$res_0.60 <- as.character(cluster_center$res_0.60)
options(repr.plot.height = 8, repr.plot.width = 12)
ggplot(meta_all[sample(nrow(meta_all)),],
aes(x = harmonized_UMAP1, y = harmonized_UMAP2, fill= res_0.60)
) +
geom_point(size = 0.4, stroke = 0.0001, shape = 21, alpha = 0.7) +
geom_label_repel(
data = cluster_center,
aes(label = res_0.60, fill = res_0.60),
# fontface = 'bold',
size = 7,
box.padding = unit(0.2, "lines"),
point.padding = unit(0.2, "lines"),
segment.color = 'grey50'
) +
scale_fill_manual(values = colors37, name = "") +
labs(
x = "UMAP1_mRNA",
y = "UMAP2_mRNA"
) +
theme_bw(base_size = 30) +
theme(
legend.position = "none",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic")
)
# test_meta <- readRDS("qc_mRNA_protein_pca_umap_clus_meta_all_site_anno_hpca_2020-02-23.rds")
test_meta[1:4,]
dim(test_meta)
colnames(test_meta)
| cell | sample | nUMI | nGene | percent_mito | percent.ribo | UMAP1 | UMAP2 | PC1 | PC2 | ⋯ | protein_harmonized_PC18 | protein_harmonized_PC19 | protein_harmonized_PC20 | UMAP1_adt | UMAP2_adt | res_0.20_adt | res_0.40_adt | hpca_anno | hpca_anno_fine | site | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <chr> | <dbl> | <int> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <int> | <chr> | <chr> | <chr> | |
| 1 | BRI-399_AAACCCAAGTCATTGC | BRI-399 | 7959 | 2497 | 0.15165222 | 0.13041839 | 6.6981000 | -5.381405 | -0.3571995 | -0.0390370210 | ⋯ | 0.2740286 | 0.57389824 | -0.3256205 | 0.9773253 | 6.577851 | 1 | 0 | Chondrocytes:MSC-derived | Chondrocytes:MSC-derived | Pittsburgh |
| 2 | BRI-399_AAACCCAAGTGAATAC | BRI-399 | 8601 | 3055 | 0.06522497 | 0.07836298 | 0.4673366 | -9.940085 | -0.2261091 | -0.0163840411 | ⋯ | -0.3393538 | 1.04008517 | -0.8825261 | -6.2643149 | 5.037686 | 4 | 4 | iPS_cells:adipose_stem_cells | iPS_cells:adipose_stem_cells | Pittsburgh |
| 4 | BRI-399_AAACCCACAGTCCGTG | BRI-399 | 13897 | 3833 | 0.07922573 | 0.14082176 | 6.9872802 | -7.038188 | -0.4850761 | 0.0008679194 | ⋯ | 2.8541640 | -0.25334417 | -0.2337399 | -2.2928325 | -4.808373 | 7 | 5 | Smooth_muscle_cells:bronchial | Smooth_muscle_cells:bronchial | Pittsburgh |
| 5 | BRI-399_AAACCCAGTAGGAGGG | BRI-399 | 33644 | 5515 | 0.08474022 | 0.18550113 | 4.7951625 | 3.897222 | 0.2578570 | 0.3665536689 | ⋯ | -1.3839181 | 0.02315231 | 0.1476639 | -4.0102350 | -5.385874 | 0 | 5 | Monocyte:CD16- | Monocyte:CD16- | Pittsburgh |
# Check expression of gene expression
plot_genes <- c("IL1B", "VSIG4", "NCAM1", "CD14", "PRG4", "THY1", "NOTCH3", "TAGLN", "FTL")
myplots <- list()
for (i in 1:length(plot_genes)) {
gene <- plot_genes[i]
max.cutoff = quantile(exprs_norm[gene,], .99)
min.cutoff = quantile(exprs_norm[gene,], .01)
tmp <- sapply(X = exprs_norm[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = harmonized_UMAP1, y = harmonized_UMAP2)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "viridis", end = .9) +
labs(x="", y="") +
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 10, repr.plot.width = 20)
p <- do.call("grid.arrange", c(myplots, ncol = 5))
# library(presto)
# genes_exclude <- grep("^MT-|^RPL|^RPS|MALAT1|MIR-", row.names(exprs_norm), value = TRUE)
# vargenes_df <- FindVariableGenesBatch(exprs_norm, meta_all, genes_exclude, 1e3)
# var_genes <- vargenes_df$gene
# length(var_genes)
# dim(exprs_norm[var_genes,])
# mRNA_marker <- wilcoxauc(exprs_norm[var_genes,], meta_all$res_0.40)
# saveRDS(mRNA_marker, "mRNA_marker_2020-02-21.rds")
mRNA_marker <- readRDS("mRNA_marker_2020-02-21.rds")
x <- mRNA_marker %>%
group_by(group) %>%
top_n(4, wt = logFC)
x
# x[which(x$feature == "CD38"),]
| feature | group | avgExpr | logFC | statistic | auc | pval | padj | pct_in | pct_out |
|---|---|---|---|---|---|---|---|---|---|
| IL7R | 0 | 6.928786 | 5.571822 | 18128259804 | 0.8464536 | 0 | 0 | 85.80338 | 21.912409 |
| IL32 | 0 | 9.681161 | 6.947141 | 17829527319 | 0.8325050 | 0 | 0 | 93.71868 | 35.496254 |
| CD52 | 0 | 7.621959 | 4.509784 | 17002535663 | 0.7938907 | 0 | 0 | 95.22378 | 43.671917 |
| LTB | 0 | 7.973223 | 6.404569 | 18157646382 | 0.8478257 | 0 | 0 | 88.60516 | 26.569585 |
| DCN | 1 | 94.241282 | 86.711190 | 19062177412 | 0.9599347 | 0 | 0 | 99.77159 | 35.409711 |
| COL1A2 | 1 | 89.585211 | 79.447059 | 18501536105 | 0.9317019 | 0 | 0 | 99.65651 | 40.829011 |
| COL3A1 | 1 | 101.188206 | 91.150735 | 18346270704 | 0.9238830 | 0 | 0 | 98.21802 | 42.191634 |
| PLA2G2A | 1 | 109.064235 | 90.133908 | 17376500185 | 0.8750472 | 0 | 0 | 93.51374 | 37.800222 |
| HLA-DRA | 2 | 194.479762 | 161.287278 | 17950069992 | 0.9096299 | 0 | 0 | 99.57837 | 72.684136 |
| TMSB4X | 2 | 209.645691 | 129.500456 | 16326743537 | 0.8273669 | 0 | 0 | 99.99649 | 99.491165 |
| FTL | 2 | 604.890482 | 525.762188 | 18564162663 | 0.9407494 | 0 | 0 | 99.99824 | 98.581088 |
| FTH1 | 2 | 430.724377 | 357.782700 | 18243601714 | 0.9245048 | 0 | 0 | 99.99649 | 98.716662 |
| CCL5 | 3 | 19.178906 | 17.627162 | 15235513639 | 0.9483510 | 0 | 0 | 97.14777 | 20.708760 |
| GNLY | 3 | 10.608617 | 10.121637 | 10635561684 | 0.6620220 | 0 | 0 | 36.71268 | 5.691370 |
| NKG7 | 3 | 14.042169 | 13.348548 | 15281549797 | 0.9512166 | 0 | 0 | 95.37657 | 12.812967 |
| CCL4 | 3 | 15.790807 | 12.461548 | 12713043758 | 0.7913372 | 0 | 0 | 78.22746 | 26.249080 |
| CLU | 4 | 149.074822 | 139.197125 | 11822639039 | 0.9451929 | 0 | 0 | 99.00967 | 45.039471 |
| PLA2G2A | 4 | 128.258462 | 105.349660 | 11036857071 | 0.8823714 | 0 | 0 | 96.56487 | 41.065638 |
| PRG4 | 4 | 652.064091 | 635.092852 | 12060230090 | 0.9641878 | 0 | 0 | 98.79386 | 45.935165 |
| FN1 | 4 | 441.889822 | 395.331116 | 11747129505 | 0.9391561 | 0 | 0 | 99.95861 | 74.473252 |
| SPARCL1 | 5 | 46.296680 | 40.259461 | 10865981172 | 0.9341407 | 0 | 0 | 99.30856 | 34.050387 |
| ACKR1 | 5 | 17.353468 | 17.156878 | 10147754668 | 0.8723953 | 0 | 0 | 76.11959 | 3.829927 |
| IGFBP7 | 5 | 56.654662 | 51.978153 | 11000476625 | 0.9457032 | 0 | 0 | 98.67793 | 43.156433 |
| VIM | 5 | 112.397100 | 26.463313 | 7741144123 | 0.6655007 | 0 | 0 | 99.88156 | 97.612775 |
| HLA-DPB1 | 6 | 79.238277 | 60.894141 | 8081533080 | 0.8115068 | 0 | 0 | 96.53435 | 71.939904 |
| LYZ | 6 | 89.253011 | 70.272251 | 8645010032 | 0.8680883 | 0 | 0 | 95.60261 | 41.211154 |
| HLA-DRA | 6 | 159.002500 | 110.276476 | 8238292782 | 0.8272478 | 0 | 0 | 99.27657 | 74.881361 |
| FTH1 | 6 | 212.508295 | 95.343038 | 7745199790 | 0.7777339 | 0 | 0 | 99.85986 | 98.829780 |
| CD69 | 7 | 10.229652 | 7.260963 | 6394926893 | 0.7165371 | 0 | 0 | 74.62413 | 40.741140 |
| CD52 | 7 | 9.772605 | 6.326130 | 7276674418 | 0.8153349 | 0 | 0 | 97.60637 | 48.864692 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| IGKC | 14 | 11761.09575 | 11282.68174 | 1251048448 | 0.8858655 | 0.000000e+00 | 0.000000e+00 | 93.20113 | 79.084951 |
| IGHG3 | 14 | 1192.11303 | 1145.87348 | 1251469835 | 0.8861639 | 0.000000e+00 | 0.000000e+00 | 90.02833 | 47.524159 |
| DCN | 15 | 75.46553 | 56.07106 | 1140239268 | 0.8649986 | 0.000000e+00 | 0.000000e+00 | 98.78530 | 44.109587 |
| COL3A1 | 15 | 80.06590 | 57.54520 | 1119884528 | 0.8495573 | 0.000000e+00 | 0.000000e+00 | 97.41877 | 49.764304 |
| PLA2G2A | 15 | 90.82053 | 59.56797 | 1081869948 | 0.8207190 | 0.000000e+00 | 0.000000e+00 | 94.50349 | 45.315923 |
| PRG4 | 15 | 128.31309 | 58.59022 | 962242027 | 0.7299679 | 0.000000e+00 | 0.000000e+00 | 84.14819 | 50.087559 |
| FAU | 16 | 36.50079 | 15.32681 | 978870008 | 0.7704349 | 0.000000e+00 | 0.000000e+00 | 99.96848 | 98.294803 |
| JUND | 16 | 33.51024 | 15.50006 | 939723249 | 0.7396238 | 0.000000e+00 | 0.000000e+00 | 99.87394 | 96.736451 |
| EEF1A1 | 16 | 104.00977 | 33.72632 | 896931385 | 0.7059438 | 0.000000e+00 | 0.000000e+00 | 100.00000 | 99.708308 |
| B2M | 16 | 137.82162 | 19.53500 | 839316902 | 0.6605974 | 6.782752e-214 | 4.714589e-213 | 100.00000 | 99.924829 |
| VIM | 17 | 216.85160 | 129.82325 | 968895745 | 0.8156574 | 0.000000e+00 | 0.000000e+00 | 100.00000 | 97.772015 |
| TMSB4X | 17 | 158.05261 | 60.08442 | 811989213 | 0.6835669 | 8.333735e-261 | 2.614603e-260 | 100.00000 | 99.559195 |
| FN1 | 17 | 140.03845 | 60.79206 | 835448189 | 0.7033156 | 0.000000e+00 | 0.000000e+00 | 97.47049 | 76.454892 |
| FTL | 17 | 240.83879 | 88.20654 | 810874234 | 0.6826282 | 3.064543e-258 | 9.533270e-258 | 100.00000 | 98.771937 |
| TAGLN | 18 | 61.20533 | 59.99027 | 852609519 | 0.9441668 | 0.000000e+00 | 0.000000e+00 | 93.37778 | 22.551863 |
| ADIRF | 18 | 33.13378 | 29.91193 | 793829020 | 0.8790742 | 0.000000e+00 | 0.000000e+00 | 91.60000 | 35.384182 |
| IGFBP7 | 18 | 81.93956 | 73.65044 | 846392786 | 0.9372825 | 0.000000e+00 | 0.000000e+00 | 98.75556 | 47.166285 |
| ACTA2 | 18 | 51.32800 | 51.15601 | 853263949 | 0.9448915 | 0.000000e+00 | 0.000000e+00 | 90.57778 | 9.842131 |
| SPARCL1 | 19 | 42.24943 | 33.16822 | 314051135 | 0.8881878 | 0.000000e+00 | 0.000000e+00 | 98.51936 | 38.971936 |
| IGFBP7 | 19 | 51.58998 | 42.98378 | 317833196 | 0.8988841 | 0.000000e+00 | 0.000000e+00 | 97.49431 | 47.344792 |
| EEF1A1 | 19 | 103.29727 | 32.82006 | 244672256 | 0.6919730 | 2.944802e-86 | 3.033063e-85 | 100.00000 | 99.709971 |
| B2M | 19 | 146.53986 | 28.16092 | 240102389 | 0.6790487 | 2.813345e-75 | 2.535458e-74 | 100.00000 | 99.925258 |
| PTGDS | 20 | 34.20860 | 28.97574 | 126948685 | 0.6772189 | 1.748264e-68 | 2.193982e-67 | 53.97849 | 24.657493 |
| GZMB | 20 | 54.07957 | 53.32593 | 185518174 | 0.9896629 | 0.000000e+00 | 0.000000e+00 | 98.70968 | 10.343288 |
| EEF1A1 | 20 | 101.27097 | 30.75780 | 136852638 | 0.7300524 | 4.059873e-66 | 4.910576e-65 | 100.00000 | 99.710268 |
| CD74 | 20 | 72.74409 | 28.99927 | 150924682 | 0.8051209 | 1.161800e-115 | 2.703758e-114 | 100.00000 | 83.806505 |
| IGFBP7 | 21 | 121.38562 | 112.77142 | 119457477 | 0.9679980 | 4.330753e-207 | 8.250869e-206 | 100.00000 | 47.414020 |
| PTMA | 21 | 140.07843 | 108.25893 | 114879242 | 0.9308993 | 3.128916e-150 | 4.245396e-149 | 100.00000 | 99.020804 |
| TMSB10 | 21 | 127.64706 | 80.03853 | 101107154 | 0.8193001 | 2.314967e-83 | 1.570505e-82 | 100.00000 | 98.986585 |
| VIM | 21 | 258.26471 | 170.41182 | 104305811 | 0.8452197 | 4.039356e-97 | 3.264332e-96 | 100.00000 | 97.786704 |
# exprs_norm <- readRDS("mRNA_exprs_norm_2020-01-24.rds")
# all(colnames(exprs_norm) == meta_all$cell)
## Heatmap of marker genes
gene_plot <- c(unique(x$feature), "THY1", "CD3D", "CD19", "PDPN", "TRDC", "MZB1", "CD27", "MCAM", "CD1C", "MKI67")
# gene_plot <- gene_plot[-which(gene_plot %in% genes_exclude)]
exp_heat <- exprs_norm[gene_plot,]
all(colnames(exp_heat) == meta_all$cell)
exp_heat <- t(exp_heat)
exp_heat <- as.data.frame(exp_heat)
all(rownames(exp_heat) == meta_all$cell)
exp_heat$res_0.60 <- as.character(meta_all$res_0.60)
dim(exp_heat)
exp_ave <- aggregate(exp_heat[, 1:(ncol(exp_heat)-1)], list(exp_heat$res_0.60), mean)
colnames(exp_ave)[1] <- "res_0.60"
exp_ave <- as.data.frame(t(exp_ave))
colnames(exp_ave) <- as.character(t(exp_ave[1,]))
exp_ave <- exp_ave[-1,]
row_names <- rownames(exp_ave)
exp_ave <- mutate_all(exp_ave, function(x) as.numeric(as.character(x)))
rownames(exp_ave) <- row_names
class(exp_ave[1,1])
exp_ave <- as.matrix(exp_ave)
mat_breaks <- seq(min(exp_ave), max(exp_ave), length.out = 10)
quantile_breaks <- function(xs, n = 10) {
breaks <- quantile(xs, probs = seq(0, 1, length.out = n))
breaks[!duplicated(breaks)]
}
mat_breaks <- quantile_breaks(exp_ave, n = 10)
# annotation_col <- meta_all[,c("res_0.60_anno", "diagnosis")]
# rownames(annotation_col) <- meta_all$cell
# d <- d[,order(annotation_col$res_0.60_anno)]
scale_rows <- function(x) t(scale(t(x)))
exp_ave_scale <- scale_rows(exp_ave) # Z-score
exp_ave_scale[exp_ave_scale > 2] <- 2
exp_ave_scale[exp_ave_scale < -2] <- -2
library(pheatmap)
options(repr.plot.width = 6, repr.plot.height = 9)
pheatmap(
mat = exp_ave_scale,
border_color = NA,
# breaks = mat_breaks,
color = magma(50),
show_rownames = TRUE,
show_colnames = TRUE,
cluster_rows = TRUE,
cluster_cols = TRUE,
cutree_rows = 6,
cutree_cols = 7,
# annotation_col = annotation_col,
# annotation_colors = meta_colors,
fontsize = 9,
fontsize_row = 9
# scale = "none"
)
# dev.copy(png,file = paste("heatmap_2019-06-25", ".png", sep = ""), width=5, height=12, units="in", res=300)
# dev.off()
# Run singleR
library(SingleR)
source("/data/srlab/ik936/Foxxy/utils/utils_singler.R")
# source("/data/srlab/ik936/SingleR/R/SingleR.R")
library(umap)
library(RcppArmadillo)
hpca$data <- hpca$data %>% t %>% scale %>% t
# hpca$data[1:4,1:4]
dim(hpca$data)
# blueprint_encode
genes_exclude <- grep("^MT-|^RPL|^RPS|MALAT1|MIR-", row.names(exprs_norm), value = TRUE)
vargenes_df <- FindVariableGenesBatch(exprs_norm, meta_all, genes_exclude, 0.2e3)
var_genes <- vargenes_df$gene
length(var_genes)
# http://comphealth.ucsf.edu/SupplementaryInformation1.html
# a correlation between the expression of a single cell (x-axis) and a reference sample (y-axis)
anno_hpca <- SingleR.CreateObject2(exprs_norm[var_genes, ],
# dcast_means,
hpca,
# immgen, # mouse
# blueprint_encode, # human
NULL, "Human", "", "10X",
do.main.types=FALSE, variable.genes="de",
fine.tune= FALSE # TRUE
)
saveRDS(anno_hpca, "anno_hpca_2020-02-21.rds")
[1] "Annotating data with HPCA..." [1] "Variable genes method: de" [1] "Number of DE genes:559" [1] "Number of cells: 403596"
anno_hpca <- readRDS("anno_hpca_moregenes_2020-02-21.rds")
meta_all <- readRDS("qc_mRNA_protein_pca_umap_clus_meta_all_2020-02-20.rds")
temp2 <- as.character(anno_hpca$SingleR.single$labels)
class(temp2)
length(temp2)
meta_all$hpca_anno <- temp2
meta_all$hpca_anno_fine <- meta_all$hpca_anno
length(table(meta_all$hpca_anno_fine))
remove <- names(table(meta_all$hpca_anno_fine))[which(as.numeric(table(meta_all$hpca_anno_fine)) < 120)]
meta_all$hpca_anno_fine[which(meta_all$hpca_anno_fine %in% remove)] <- ""
# table(meta_all$hpca_anno_fine)
# saveRDS(meta_all, "qc_mRNA_protein_pca_umap_clus_meta_all_anno_hpca_2020-02-20.rds")
plot_clusters3 <- function(cluster_ids, labels, pt_size = 14, umap_use = umap_post, do_labels = FALSE) {
cluster_table <- table(cluster_ids)
clusters_keep <- names(which(cluster_table > 20))
plt_df <- umap_use %>% data.frame() %>% cbind(cluster = cluster_ids) %>%
subset(cluster %in% clusters_keep)
plt <- plt_df %>%
ggplot(aes(X1, X2, col = factor(cluster))) + geom_point(shape = '.', alpha = .6) +
theme_tufte() + geom_rangeframe(col = "black") +
# theme(axis.line = element_line()) +
guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 21, size = 4))) +
scale_color_manual(values = singler.colors) +
labs(x = "UMAP 1", y = "UMAP 2") +
theme(plot.title = element_text(hjust = .5)) +
guides(col = FALSE)
if (do_labels)
plt <- plt + geom_label(data = data.table(plt_df)[, .(X1 = mean(X1), X2 = mean(X2)), by = cluster],
aes(label = cluster), size = pt_size, alpha = .8)
return(plt)
}
# Find cluster center
# This function provides normalized expression values 713 microarray samples of the Human Primary Cell Atlas (HPCA) (Mabbott et al., 2013).
# These 713 samples were processed and normalized as described in Aran, Looney and Liu et al. (2019)
# and each sample has been assigned to one of 37 main cell types and 157 subtypes.
set3 = c("#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#D9D9D9", "#BC80BD", "#CCEBC5", "#FFED6F")
set2 = c("#66C2A5", "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", "#E5C494", "#B3B3B3")
colors37 = c("#466791","#60bf37","#953ada","#4fbe6c","#ce49d3","#a7b43d","#5a51dc","#d49f36","#552095","#507f2d","#db37aa","#84b67c","#a06fda","#df462a","#5b83db","#c76c2d","#4f49a3","#82702d","#dd6bbb","#334c22","#d83979","#55baad","#dc4555","#62aad3","#8c3025","#417d61","#862977","#bba672","#403367","#da8a6d","#a79cd4","#71482c","#c689d0","#6b2940","#d593a7","#895c8b","#bd5975")
cluster_center <- meta_all %>%
group_by(hpca_anno_fine) %>%
summarise_at(vars(harmonized_UMAP1, harmonized_UMAP2), funs(median(., na.rm=TRUE)))
cluster_center <- as.data.frame(cluster_center)
cluster_center$hpca_anno_fine <- as.character(cluster_center$hpca_anno_fine)
meta_all$hpca_anno_fine <- as.character(meta_all$hpca_anno_fine)
options(repr.plot.height = 10, repr.plot.width = 14)
ggplot(meta_all[sample(nrow(meta_all)),],
aes(x = harmonized_UMAP1, y = harmonized_UMAP2, fill= hpca_anno_fine)
) +
geom_point(size = 0.6, stroke = 0.0001, shape = 21, alpha = 0.6) +
geom_label_repel(
data = cluster_center,
aes(label = hpca_anno_fine, fill = hpca_anno_fine),
# fontface = 'bold',
size = 4,
box.padding = unit(0.5, "lines"),
point.padding = unit(0.7, "lines"),
segment.color = 'grey50'
) +
scale_fill_manual(values = c(sample(colors37), set3, set2), name = "") +
labs(
x = "UMAP1_mRNA",
y = "UMAP2_mRNA",
title = "Reference-based single-cell annotation using Human Primary Cell\nAtlas (HPCA) based on 37 main cell types and 157 subtypes (SingleR)"
) +
theme_bw(base_size = 20) +
theme(
legend.position = "none",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=20)
)
options(repr.plot.height = 8, repr.plot.width = 12)
ggplot() +
geom_point(
data = meta_all[sample(nrow(meta_all)),],
mapping = aes_string(x = "harmonized_UMAP1", y = "harmonized_UMAP2", fill = "sample"),
size = 0.4, stroke = 0.0001, shape = 21
) +
# scale_fill_manual(values = meta_colors$sample, name = "") +
labs(
x = "UMAP1_mRNA",
y = "UMAP2_mRNA"
# title = paste0("Clustering of ", nrow(meta_all), " cells on proteins; \nColored by ", nrow(as.data.frame(table(meta_all$sample))), " samples")
) +
theme_bw(base_size = 30) +
theme(
legend.position = "none",
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30)
)
# Number of cells found in each pair of mRNA and protein clusters
# (e.g., number of cells in mRNA cluster 1 AND protein cluster 1)
mrna_adt_intersect <- matrix(nrow = length(unique(meta_all$res_0.60)), ncol = length(unique(meta_all$res_0.20_adt)))
dim(mrna_adt_intersect)
for(x in 1:length(unique(meta_all$res_0.60))) {
for(y in 1:length(unique(meta_all$res_0.20_adt))) {
mrna_adt_intersect[x,y] <- sum(meta_all$res_0.60 == (x-1) & meta_all$res_0.20_adt == (y-1))
}
}
class(mrna_adt_intersect)
# Quickly gave clusters labels based on marker genes and proteins identified by differential expression
row.names(mrna_adt_intersect) <- seq(0, nrow(mrna_adt_intersect)-1)
colnames(mrna_adt_intersect) <- seq(0, ncol(mrna_adt_intersect)-1)
# row_nam <- c()
col_nam <- c("M2 monocytes", "CD90+ fibroblasts", "CD4 T", "CD8 T", "Endothelial",
"B cell", "NK", "DC", "Plasma", "CX3CR1+ tissue-resident macrophage",
"CD09+ fibroblasts", "Endothelial", "Endothelial"
)
colnames(mrna_adt_intersect) <- paste0(colnames(mrna_adt_intersect), "_", col_nam, sep="")
# row.names(mrna_adt_intersect) <- c("THY1+ Fibroblast", "CD8+ T cell", "Myeloid", "CD4+ PD-1+ T cell", "Endothelial", "C1QA+ Myeloid", "Myeloid", "Myeloid", "Endothelial", "THY1+ Fibroblast", "PRG4+ Fibroblast", "Gamma delta T cell", "THY1+ CD34+ Fibroblast", "Endothelial", "Plasma cell", "B cell", "MT high T cell", "Proliferating T cell", "Endothelial")
# colnames(mrna_adt_intersect) <- c("C1QA+ Myeloid", "Endothelial", "CD8+ T cell", "THY1+ Fibroblast", "CD4+ T cell", "CD4+ PD-1+ T cell", "THY1+ Fibroblast", "THY1+ Fibroblast", "Myeloid", "Myeloid", "Myeloid", "Endothelial", "C1QA+ Myeloid", "HLA-high Myeloid", "B cell", "Gamma delta T cell", "Endothelial")
# heatmap with mRNA clusters in rows, protein clusters in columns
# scaled by column: most red square in a column is mRNA cluster with most cells from that protein cluster
library(pheatmap)
mat_breaks <- seq(min(mrna_adt_intersect), max(mrna_adt_intersect), length.out = 10)
quantile_breaks <- function(xs, n = 10) {
breaks <- quantile(xs, probs = seq(0, 1, length.out = n))
breaks[!duplicated(breaks)]
}
mat_breaks <- quantile_breaks(mrna_adt_intersect, n = 10)
scale_rows <- function(x) t(scale(t(x)))
mat <- scale_rows(mrna_adt_intersect) # Z-score
mat[mat > 2] <- 2
mat[mat < -2] <- -2
options(repr.plot.width = 15, repr.plot.height = 6)
pheatmap(
mat = t(mat),
border_color = NA,
# breaks = mat_breaks,
color = viridis(20),
show_rownames = TRUE,
show_colnames = TRUE,
cluster_rows = TRUE,
cluster_cols = TRUE,
cutree_rows = 9,
cutree_cols = 6,
# annotation_col = annotation_col,
# annotation_colors = meta_colors,
fontsize = 20,
fontsize_row = 20
# scale = "none"
)
# heatmap with mRNA clusters in rows, protein clusters in columns
# scaled by column: most red square in a column is mRNA cluster with most cells from that protein cluster
library(pheatmap)
mat_breaks <- seq(min(mrna_adt_intersect), max(mrna_adt_intersect), length.out = 10)
quantile_breaks <- function(xs, n = 10) {
breaks <- quantile(xs, probs = seq(0, 1, length.out = n))
breaks[!duplicated(breaks)]
}
mat_breaks <- quantile_breaks(mrna_adt_intersect, n = 10)
scale_rows <- function(x) t(scale(t(x)))
mat <- scale(mrna_adt_intersect) # Z-score
mat[mat > 2] <- 2
mat[mat < -2] <- -2
options(repr.plot.width = 15, repr.plot.height = 6)
pheatmap(
mat = t(mat),
border_color = NA,
# breaks = mat_breaks,
color = viridis(20),
show_rownames = TRUE,
show_colnames = TRUE,
cluster_rows = TRUE,
cluster_cols = TRUE,
cutree_rows = 6,
cutree_cols = 6,
# annotation_col = annotation_col,
# annotation_colors = meta_colors,
fontsize = 20,
fontsize_row = 20
# scale = "none"
)
# adt_exprs_norm <- readRDS("adt_exprs_norm_2020-01-25.rds")
# meta_all <- readRDS("qc_mRNA_protein_pca_umap_clus_meta_all_site_anno_hpca_2020-03-31.rds")
# all(colnames(adt_exprs_norm) == meta_all$cell)
plot_protein <- c("CD14_prot", "CD163_prot")
myplots <- list()
for (i in 1:length(plot_protein)) {
gene <- plot_protein[i]
max.cutoff = quantile(adt_exprs_norm[gene,], .99)
min.cutoff = quantile(adt_exprs_norm[gene,], .01)
tmp <- sapply(X = adt_exprs_norm[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = UMAP1_adt, y = UMAP2_adt)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "plasma", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 4.5, repr.plot.width = 25)
p <- do.call("grid.arrange", c(myplots, ncol = 6))
# 4: endothelial
# 0: monocytes
#
# “This is what the protein looks like, no filtering” to show that different markers have different levels of background
plot_protein <- rownames(adt_exprs_norm_filter)
myplots <- list()
for (i in 1:length(plot_protein)) {
gene <- plot_protein[i]
max.cutoff = quantile(adt_exprs_norm_filter[gene,], .99)
min.cutoff = quantile(adt_exprs_norm_filter[gene,], .01)
tmp <- sapply(X = adt_exprs_norm_filter[gene,], FUN = function(x) {
return(ifelse(test = x > max.cutoff, yes = max.cutoff,
no = x))
})
tmp <- sapply(X = tmp, FUN = function(x) {
return(ifelse(test = x < min.cutoff, yes = min.cutoff,
no = x))
})
meta_all$gene <- as.numeric(tmp)
ind <- paste("p", i, sep = "")
ind <- ggplot(
data = meta_all[sample(nrow(meta_all)),],
# data = meta_all[order(meta_all$gene),] ,
aes(x = UMAP1_adt, y = UMAP2_adt)) +
geom_point(mapping = aes(color = gene), size = 0.1) +
scale_color_viridis(option = "plasma", end = .9) +
labs(x="", y="")+
theme_bw(base_size = 15)+
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(color="black", size=30, face = "italic"), # face="bold.italic"
legend.position = "none") +
labs(title = gene)
myplots[[i]] <- ind
}
options(repr.plot.height = 40, repr.plot.width = 25)
p <- do.call("grid.arrange", c(myplots, ncol = 6))
# 4: endothelial
# 0: monocytes
#